From e10073777a70809eef2cc92250f111dce6f4ace6 Mon Sep 17 00:00:00 2001 From: Gilbert Lee Date: Fri, 10 Apr 2026 16:18:41 -0500 Subject: [PATCH 1/4] Adding Batched DMA support (hipMemcpyBatchAsync), and bmasweep preset --- CHANGELOG.md | 20 +- examples/example.cfg | 4 +- src/client/Presets/BmaSweep.hpp | 155 +++++++++++++++ src/client/Presets/Presets.hpp | 4 +- src/client/Utilities.hpp | 13 +- src/header/TransferBench.hpp | 333 ++++++++++++++++++++++++++------ 6 files changed, 455 insertions(+), 74 deletions(-) create mode 100644 src/client/Presets/BmaSweep.hpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 280fd82..cb79743 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,16 +14,20 @@ Documentation for TransferBench is available at - Adding NIC_CQ_POLL_BATCH to control CQ poll batch size for NIC transfers - New "hbm" preset which sweeps and tests local HBM read performance - Added a new TB_WALLCLOCK_RATE that will override GPU GFX wallclock rate if it returns 0 (debug) +- Adding new batched-DMA executor "B", which utilizes the hipMemcpyBatchAsync API introduced in HIP 7.0 +- Added new bmasweep preset that compares DMA to batched DMA execution for parallel transfers to other GPUs ### Modified - - DMA-BUF support enablement in CMake changed to ENABLE_DMA_BUF to be more similar to other compile-time options - - Adding extra information to CMake and make build methods to indicate enabled / disabled features - - a2asweep preset changes from USE_FINE_GRAIN to MEM_TYPE to reflect various memory types - - a2asweep preset changes from NUM_CUS to NUM_SUB_EXECS to match with a2a preset naming convention - - scaling preset changes from using USE_FINE_GRAIN to CPU_MEM_TYPE and GPU_MEM_TYPE - - NIC_FILTER renamed to TB_NIC_FILTER for consistency - - DUMP_LINES renamed to TB_DUMP_LINES for consistency - - Dynamically size CQs for NIC transfers in high QPs case +- DMA-BUF support enablement in CMake changed to ENABLE_DMA_BUF to be more similar to other compile-time options +- 
Adding extra information to CMake and make build methods to indicate enabled / disabled features +- a2asweep preset changes from USE_FINE_GRAIN to MEM_TYPE to reflect various memory types +- a2asweep preset changes from NUM_CUS to NUM_SUB_EXECS to match with a2a preset naming convention +- scaling preset changes from using USE_FINE_GRAIN to CPU_MEM_TYPE and GPU_MEM_TYPE +- NIC_FILTER renamed to TB_NIC_FILTER for consistency +- DUMP_LINES renamed to TB_DUMP_LINES for consistency +- Dynamically size CQs for NIC transfers in high QPs case +- Switch to using hipMemcpyDevicetoDeviceNoCU instead of hipMemcpyDefault for DMA Executor if available (requires HIP >= 6.0) +- Allow for multiple destination memory locations for DMA/Batched-DMA Transfers ## v1.66.02 ### Added diff --git a/examples/example.cfg b/examples/example.cfg index 57d4ae0..9c5e29f 100644 --- a/examples/example.cfg +++ b/examples/example.cfg @@ -8,12 +8,13 @@ # SRC 1 -> Executor -> DST 1 # SRC X DST Y -# Three Executors are supported by TransferBench +# Five Executors are supported by TransferBench # Executor: SubExecutor: # 1) CPU CPU thread # 2) GPU GPU threadblock/Compute Unit (CU) # 3) DMA N/A. 
(May only be used for copies (single SRC/DST) # 4) NIC Queue Pair +# 5) Batched-DMA Batch size # Each single line in the configuration file defines a set of Transfers (a Test) to run in parallel @@ -38,6 +39,7 @@ # - C: CPU-executed (Indexed from 0 to # NUMA nodes - 1) # - G: GPU-executed (Indexed from 0 to # GPUs - 1) # - D: DMA-executor (Indexed from 0 to # GPUs - 1) +# - B: Batched-DMA-executor (Indexed from 0 to # GPUs - 1) # - I#.#: NIC executor (Indexed from 0 to # NICs - 1) # - N#.#: Nearest NIC executor (Indexed from 0 to # GPUs - 1) # dstMemL : Destination memory locations (Where the data is to be written to) diff --git a/src/client/Presets/BmaSweep.hpp b/src/client/Presets/BmaSweep.hpp new file mode 100644 index 0000000..cb28f82 --- /dev/null +++ b/src/client/Presets/BmaSweep.hpp @@ -0,0 +1,155 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +int BmaSweepPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) +{ + if (TransferBench::GetNumRanks() > 1) { + Utils::Print("[ERROR] BMA sweep preset currently not supported for multi-node\n"); + return 1; + } + +#ifndef BMA_EXEC_ENABLED + Utils::Print("[ERROR] BMA executor requires ROCm 7.0 or newer\n"); + return 1; +#endif + + int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); + + // Collect env vars for this preset + int exeIndex = EnvVars::GetEnvVar("EXE_INDEX" , 0); + int localCopy = EnvVars::GetEnvVar("LOCAL_COPY" , 0); + int gpuMemTypeIdx = EnvVars::GetEnvVar("GPU_MEM_TYPE" , 0); + int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES" , numDetectedGpus); + vector numSesList = EnvVars::GetEnvVarArray("NUM_SUB_EXECS", {1,2,4,8}); + + MemType gpuMemType = Utils::GetGpuMemType(gpuMemTypeIdx); + + // Display environment variables + if (Utils::RankDoesOutput()) { + ev.DisplayEnvVars(); + if (!ev.hideEnv) { + int outputToCsv = ev.outputToCsv; + if (!outputToCsv) printf("[BMA Sweep Related]\n"); + ev.Print("EXE_INDEX" , exeIndex, "Executing on GPU %d", exeIndex); + ev.Print("LOCAL_COPY" , localCopy, "%s local copy to GPU %d", localCopy ? "Including" : "Excluding", exeIndex); + ev.Print("GPU_MEM_TYPE" , gpuMemTypeIdx, "Using %s (%s)", Utils::GetGpuMemTypeStr(gpuMemTypeIdx).c_str(), Utils::GetAllGpuMemTypeStr().c_str()); + ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices); + ev.Print("NUM_SUB_EXECS" , numSesList.size(), EnvVars::ToStr(numSesList).c_str()); + printf("\n"); + } + } + + if (exeIndex < 0 || exeIndex >= numGpuDevices) { + Utils::Print("EXE_INDEX must be between 0 and %d inclusively\n", numGpuDevices - 1); + return 1; + } + + int numTransfers = numGpuDevices - 1 + (localCopy ? 
1 : 0); + + TransferBench::ConfigOptions cfg = ev.ToConfigOptions(); + TransferBench::TestResults results; + + // Prepare table of results + int minPow2Exp = 12; + int maxPow2Exp = 30; + int numRows = (maxPow2Exp - minPow2Exp + 1) + 1; + int numCols = 2 + numSesList.size(); + + Utils::TableHelper table(numRows, numCols); + + Utils::Print("Performing %d simultaneous DMA Transfers from GPU %d to other GPUs\n", numTransfers, exeIndex); + + // Prepare headers + table.Set(0, 0, " Bytes "); + table.Set(0, 1, " DMA "); + for (int i = 0; i < numSesList.size(); i++) { + table.Set(0, 2+i, " BMA (%d) ", numSesList[i]); + } + table.DrawRowBorder(0); + table.DrawRowBorder(1); + table.DrawRowBorder(numRows); + table.DrawColBorder(0); + table.DrawColBorder(1); + table.DrawColBorder(2); + table.DrawColBorder(numCols); + + if (!ev.outputToCsv){ + Utils::Print("Executing: "); + fflush(stdout); + }; + + for (size_t numBytes = 1ULL< transfers(1); + + Transfer& t = transfers[0]; + t.numBytes = numBytes; + t.srcs = {{gpuMemType, exeIndex}}; + t.dsts.clear(); + for (int i = 0; i < numGpuDevices; i++) { + if (i == exeIndex && localCopy == 0) continue; + t.dsts.push_back({gpuMemType, i}); + } + + // DMA executor first + t.exeDevice = {EXE_GPU_DMA, exeIndex}; + t.numSubExecs = 1; + + if (!TransferBench::RunTransfers(cfg, transfers, results)) { + for (auto const& err : results.errResults) + Utils::Print("%s\n", err.errMsg.c_str()); + return 1; + } + + table.Set(currRow, 1, " %6.2f ", numTransfers * results.tfrResults[0].avgBandwidthGbPerSec); + + // BMA executor next + t.exeDevice = {EXE_GPU_BDMA, exeIndex}; + for (int i = 0; i < numSesList.size(); i++) { + t.numSubExecs = numSesList[i]; + + if (!TransferBench::RunTransfers(cfg, transfers, results)) { + for (auto const& err : results.errResults) + Utils::Print("%s\n", err.errMsg.c_str()); + return 1; + } + + table.Set(currRow, 2+i, " %6.2f ", numTransfers * results.tfrResults[0].avgBandwidthGbPerSec); + } + } + + if (!ev.outputToCsv) { + 
Utils::Print("\n"); + } + table.PrintTable(ev.outputToCsv, ev.showBorders); + Utils::Print("Reported numbers are all GB/s, normalized for per Transfer for %d Transfers\n", numTransfers); + + return 0; +} diff --git a/src/client/Presets/Presets.hpp b/src/client/Presets/Presets.hpp index d856c6a..6b2dfd6 100644 --- a/src/client/Presets/Presets.hpp +++ b/src/client/Presets/Presets.hpp @@ -30,6 +30,7 @@ THE SOFTWARE. #include "AllToAll.hpp" #include "AllToAllN.hpp" #include "AllToAllSweep.hpp" +#include "BmaSweep.hpp" #include "GfxSweep.hpp" #include "HbmBandwidth.hpp" #include "HealthCheck.hpp" @@ -53,7 +54,8 @@ std::map> presetFuncMap = {"a2a", {AllToAllPreset, "Tests parallel transfers between all pairs of GPU devices"}}, {"a2a_n", {AllToAllRdmaPreset, "Tests parallel transfers between all pairs of GPU devices using Nearest NIC RDMA transfers"}}, {"a2asweep", {AllToAllSweepPreset, "Test GFX-based all-to-all transfers swept across different CU and GFX unroll counts"}}, - {"gfxsweep", {GfxSweepPreset, "Sweep BLOCKSIZES, UNROLLS, and NUM_SUB_EXECS for one GFX transfer (GFX_SWEEP_TRANSFER)"}}, + {"bmasweep", {BmaSweepPreset, "Test and compare batched DMA executor for multi destination copies"}}, + {"gfxsweep", {GfxSweepPreset, "Sweep over various GFX kernel options for a given GFX Transfer"}}, {"hbm", {HbmBandwidthPreset, "Tests HBM bandwidth"}}, {"healthcheck", {HealthCheckPreset, "Simple bandwidth health check (MI300X series only)"}}, {"nicrings", {NicRingsPreset, "Tests NIC rings created across identical NIC indices across ranks"}}, diff --git a/src/client/Utilities.hpp b/src/client/Utilities.hpp index 9b7cfb8..381d97c 100644 --- a/src/client/Utilities.hpp +++ b/src/client/Utilities.hpp @@ -393,12 +393,13 @@ namespace TransferBench::Utils std::string ExeTypeToStr(ExeType exeType) { switch (exeType) { - case EXE_CPU: return "CPU"; - case EXE_GPU_GFX: return "GPU"; - case EXE_GPU_DMA: return "DMA"; - case EXE_NIC: return "NIC"; - case EXE_NIC_NEAREST: return "NIC"; - 
default: return "N/A"; + case EXE_CPU: return "CPU"; + case EXE_GPU_GFX: return "GPU"; + case EXE_GPU_DMA: return "DMA"; + case EXE_NIC: return "NIC"; + case EXE_NIC_NEAREST: return "NIC"; + case EXE_GPU_BDMA: return "BMA"; + default: return "N/A"; } } diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp index 5f535e3..11efbd9 100644 --- a/src/header/TransferBench.hpp +++ b/src/header/TransferBench.hpp @@ -78,6 +78,11 @@ THE SOFTWARE. #endif /// @endcond +// Batched DMA executor is only supported with HIP >= 7.0 +#if defined(__HIP_PLATFORM_AMD__) && defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >= 7) +#define BMA_EXEC_ENABLED +#endif + namespace TransferBench { using std::map; @@ -98,11 +103,12 @@ namespace TransferBench EXE_GPU_GFX = 1, ///< GPU kernel-based executor (subExecutor = threadblock/CU) EXE_GPU_DMA = 2, ///< GPU SDMA executor (subExecutor = not supported) EXE_NIC = 3, ///< NIC RDMA executor (subExecutor = queue pair) - EXE_NIC_NEAREST = 4 ///< NIC RDMA nearest executor (subExecutor = queue pair) + EXE_NIC_NEAREST = 4, ///< NIC RDMA nearest executor (subExecutor = queue pair) + EXE_GPU_BDMA = 5, ///< GPU Batched SDMA executor (subExecutor = batch size) }; - char const ExeTypeStr[6] = "CGDIN"; + char const ExeTypeStr[7] = "CGDINB"; inline bool IsCpuExeType(ExeType e){ return e == EXE_CPU; } - inline bool IsGpuExeType(ExeType e){ return e == EXE_GPU_GFX || e == EXE_GPU_DMA; } + inline bool IsGpuExeType(ExeType e){ return e == EXE_GPU_GFX || e == EXE_GPU_DMA || e == EXE_GPU_BDMA; } inline bool IsNicExeType(ExeType e){ return e == EXE_NIC || e == EXE_NIC_NEAREST; } /** @@ -621,6 +627,7 @@ namespace TransferBench #define hipErrorPeerAccessAlreadyEnabled cudaErrorPeerAccessAlreadyEnabled #define hipFuncCachePreferShared cudaFuncCachePreferShared #define hipMemcpyDefault cudaMemcpyDefault + #define hipMemcpyKind cudaMemcpyKind #define hipMemcpyDeviceToHost cudaMemcpyDeviceToHost #define hipMemcpyHostToDevice cudaMemcpyHostToDevice #define 
hipSuccess cudaSuccess @@ -1061,7 +1068,7 @@ namespace { // Topology related struct RankTopology { - char hostname[33]; + char hostname[33]; char ppodId[16]; int64_t vpodId; @@ -2162,10 +2169,15 @@ namespace { break; } + if (t.numBytes % 4) { + errors.push_back({ERR_FATAL, "Transfer %d: numBytes must be a multiple of 4", i}); + break; + } + // Each subexecutor is assigned a multiple of cfg.data.blockBytes, however this may // mean that some subexecutors might not have any work assigned to them if the amount to // transfer is small - if (t.exeDevice.exeType == EXE_GPU_GFX || t.exeDevice.exeType == EXE_CPU) { + if (t.exeDevice.exeType == EXE_GPU_GFX || t.exeDevice.exeType == EXE_CPU || t.exeDevice.exeType == EXE_GPU_BDMA) { size_t const N = t.numBytes / sizeof(float); int const targetMultiple = cfg.data.blockBytes / sizeof(float); int const maxSubExecToUse = std::min((size_t)(N + targetMultiple - 1) / targetMultiple, @@ -2250,9 +2262,15 @@ namespace { } break; case EXE_GPU_DMA: - if (t.srcs.size() != 1 || t.dsts.size() != 1) { + if (t.srcs.size() != 1) { + errors.push_back({ERR_FATAL, + "Transfer %d: DMA executor must have exactly 1 source", i}); + hasFatalError = true; + break; + } + if (t.dsts.size() < 1) { errors.push_back({ERR_FATAL, - "Transfer %d: DMA executor must have exactly 1 source and 1 destination", i}); + "Transfer %d: DMA executor must have at least 1 destination", i}); hasFatalError = true; break; } @@ -2294,19 +2312,10 @@ namespace { } } - err = System::Get().GetHsaAgent(t.dsts[0], dstAgent); - if (err.errType != ERR_NONE) { - errors.push_back(err); - if (err.errType == ERR_FATAL) { - hasFatalError = true; - break; - } - } - // Skip check of engine Id mask for self copies - if (srcAgent.handle != dstAgent.handle) { - uint32_t engineIdMask = 0; - err = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &engineIdMask); + int numDsts = (int)t.dsts.size(); + for (int dstIdx = 0; dstIdx < numDsts; dstIdx++) { + err = 
System::Get().GetHsaAgent(t.dsts[dstIdx], dstAgent); if (err.errType != ERR_NONE) { errors.push_back(err); if (err.errType == ERR_FATAL) { @@ -2314,15 +2323,29 @@ namespace { break; } } - hsa_amd_sdma_engine_id_t sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << t.exeSubIndex); - if (!(sdmaEngineId & engineIdMask)) { - errors.push_back({ERR_FATAL, - "Transfer %d: DMA %d.%d does not exist or cannot copy between src/dst", - i, t.exeDevice.exeIndex, t.exeSubIndex}); - hasFatalError = true; - break; + + // Skip check of engine Id mask for self copies + if (srcAgent.handle != dstAgent.handle) { + uint32_t engineIdMask = 0; + err = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &engineIdMask); + if (err.errType != ERR_NONE) { + errors.push_back(err); + if (err.errType == ERR_FATAL) { + hasFatalError = true; + break; + } + } + hsa_amd_sdma_engine_id_t sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << t.exeSubIndex); + if (!(sdmaEngineId & engineIdMask)) { + errors.push_back({ERR_FATAL, + "Transfer %d: DMA %d.%d does not exist or cannot copy between src/dst", + i, t.exeDevice.exeIndex, t.exeSubIndex}); + hasFatalError = true; + break; + } } } + if (hasFatalError) break; #endif } @@ -2345,6 +2368,60 @@ namespace { } } break; + case EXE_GPU_BDMA: +#ifdef BMA_EXEC_ENABLED + if (t.srcs.size() != 1) { + errors.push_back({ERR_FATAL, + "Transfer %d: BMA executor must have exactly 1 source", i}); + hasFatalError = true; + break; + } + if (t.dsts.size() < 1) { + errors.push_back({ERR_FATAL, + "Transfer %d: BMA executor must have at least 1 destination", i}); + hasFatalError = true; + break; + } + + if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numExecutors) { + errors.push_back({ERR_FATAL, + "Transfer %d: BMA index must be between 0 and %d (instead of %d) for rank %d", + i, numExecutors - 1, t.exeDevice.exeIndex, t.exeDevice.exeRank}); + hasFatalError = true; + break; + } + + if (t.exeSubIndex != -1) { + errors.push_back({ERR_FATAL, + "Transfer %d: BMA executor does 
not support executor subindices (SDMA engine selection)", i}); + hasFatalError = true; + break; + } + + if (!IsGpuMemType(t.srcs[0].memType) && !IsGpuMemType(t.dsts[0].memType)) { + errors.push_back({ERR_WARN, + "Transfer %d: No GPU memory for source or destination. Copy might not execute on BMA %d", + i, t.exeDevice.exeIndex}); + } else { + if (IsGpuMemType(t.srcs[0].memType)) { + if (t.srcs[0].memIndex != t.exeDevice.exeIndex) { + errors.push_back({ERR_WARN, + "Transfer %d: BMA executor may use the source memory device (%d) not (%d)", + i, t.srcs[0].memIndex, t.exeDevice.exeIndex}); + } + } else if (t.dsts[0].memIndex != t.exeDevice.exeIndex) { + errors.push_back({ERR_WARN, + "Transfer %d: BMA executor may use the destination memory device (%d) not (%d)", + i, t.dsts[0].memIndex, t.exeDevice.exeIndex}); + } + } + break; +#else + errors.push_back({ERR_FATAL, + "Transfer %d: BMA executor requires ROCm 7.0 or newer (AMD HIP with hipMemcpyBatchAsync)", i}); + hasFatalError = true; + break; +#endif case EXE_NIC: case EXE_NIC_NEAREST: #ifdef NIC_EXEC_ENABLED { @@ -2521,6 +2598,15 @@ namespace { "DMA %d copies will fallback to blit (GFX) kernels", exeDevice.exeIndex}); break; } + case EXE_GPU_BDMA: + { + if (transferCount[exeDevice] > gpuMaxHwQueues) { + errors.push_back({ERR_WARN, + "BMA %d attempting %d parallel transfers, however GPU_MAX_HW_QUEUES only set to %d", + exeDevice.exeIndex, transferCount[exeDevice], gpuMaxHwQueues}); + } + break; + } default: break; } @@ -2578,13 +2664,13 @@ namespace { // For targeted-SDMA #if !defined(__NVCC__) - hsa_agent_t dstAgent; ///< DMA destination memory agent + vector dstAgent; ///< DMA destination memory agents hsa_agent_t srcAgent; ///< DMA source memory agent hsa_signal_t signal; ///< HSA signal for completion hsa_amd_sdma_engine_id_t sdmaEngineId; ///< DMA engine ID #endif -// For IBV executor + // For IBV executor #ifdef NIC_EXEC_ENABLED int srcNicIndex; ///< SRC NIC index int dstNicIndex; ///< DST NIC index @@ -2612,6 
+2698,13 @@ namespace { vector>sendWorkRequests; ///< Send work requests per queue pair #endif + // For BMA executor +#ifdef BMA_EXEC_ENABLED + vector batchDsts; + vector batchSrcs; + vector batchBytes; +#endif + // Counters double totalDurationMsec; ///< Total duration for all iterations for this Transfer vector perIterMsec; ///< Duration for each individual iteration @@ -3791,6 +3884,21 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } } +#ifdef BMA_EXEC_ENABLED + // Prepare src/dst pointers for batched DMA executor + rss.batchDsts.clear(); + rss.batchSrcs.clear(); + rss.batchBytes.clear(); + + for (int i = 0; i < transfer.numSubExecs; ++i) { + for (int j = 0; j < (int)rss.dstMem.size(); j++) { + rss.batchSrcs.push_back(subExecParam[i].src[0]); + rss.batchDsts.push_back(subExecParam[i].dst[j]); + rss.batchBytes.push_back(subExecParam[i].N * sizeof(float)); + } + } +#endif + // Clear counters rss.totalDurationMsec = 0.0; @@ -3926,8 +4034,12 @@ static bool IsConfiguredGid(union ibv_gid const& gid) // Collect HSA agent information hsa_amd_pointer_info_t info; info.size = sizeof(info); - ERR_CHECK(hsa_amd_pointer_info(rss.dstMem[0], &info, NULL, NULL, NULL)); - rss.dstAgent = info.agentOwner; + int numDst = (int)rss.dstMem.size(); + rss.dstAgent.resize(numDst); + for (int dstIdx = 0; dstIdx < numDst; dstIdx++) { + ERR_CHECK(hsa_amd_pointer_info(rss.dstMem[dstIdx], &info, NULL, NULL, NULL)); + rss.dstAgent[dstIdx] = info.agentOwner; + } ERR_CHECK(hsa_amd_pointer_info(rss.srcMem[0], &info, NULL, NULL, NULL)); rss.srcAgent = info.agentOwner; @@ -3945,11 +4057,12 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } // Prepare additional requirements for GPU-based executors - if ((exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA) && exeDevice.exeRank == localRank) { + if ((exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA || exeDevice.exeType == EXE_GPU_BDMA) + && exeDevice.exeRank == localRank) { 
ERR_CHECK(hipSetDevice(exeDevice.exeIndex)); // Determine how many streams to use - int const numStreamsToUse = (exeDevice.exeType == EXE_GPU_DMA || + int const numStreamsToUse = (exeDevice.exeType == EXE_GPU_DMA || exeDevice.exeType == EXE_GPU_BDMA || (exeDevice.exeType == EXE_GPU_GFX && cfg.gfx.useMultiStream)) ? exeInfo.resources.size() : 1; exeInfo.streams.resize(numStreamsToUse); @@ -4147,7 +4260,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } // Teardown additional requirements for GPU-based executors - if ((exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA) && exeDevice.exeRank == localRank) { + if ((exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA || exeDevice.exeType == EXE_GPU_BDMA) + && exeDevice.exeRank == localRank) { for (auto stream : exeInfo.streams) ERR_CHECK(hipStreamDestroy(stream)); if (cfg.gfx.useHipEvents || cfg.dma.useHipEvents) { @@ -4893,22 +5007,33 @@ static bool IsConfiguredGid(union ibv_gid const& gid) { auto cpuStart = std::chrono::high_resolution_clock::now(); + int numDsts = (int)resources.dstMem.size(); ERR_CHECK(hipSetDevice(exeIndex)); int subIterations = 0; if (!useSubIndices && !cfg.dma.useHsaCopy) { if (cfg.dma.useHipEvents) ERR_CHECK(hipEventRecord(startEvent, stream)); + // Force the use of SDMA engine if possible +#if defined(__HIP_PLATFORM_AMD__) && defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >= 6) + hipMemcpyKind memcpyKind = hipMemcpyDeviceToDeviceNoCU; +#else + hipMemcpyKind memcpyKind = hipMemcpyDefault; +#endif + // Use DMA copy engine do { + // Queue for each output location + for (int dstIdx = 0; dstIdx < numDsts; dstIdx++) { #if defined(__NVCC__) - ERR_CHECK(cuMemcpyAsync((CUdeviceptr)resources.dstMem[0], - (CUdeviceptr)resources.srcMem[0], - resources.numBytes, stream)); + ERR_CHECK(cuMemcpyAsync((CUdeviceptr)resources.dstMem[dstIdx], + (CUdeviceptr)resources.srcMem[0], + resources.numBytes, stream)); #else - ERR_CHECK(hipMemcpyAsync(resources.dstMem[0], 
resources.srcMem[0], resources.numBytes, - hipMemcpyDefault, stream)); + ERR_CHECK(hipMemcpyAsync(resources.dstMem[dstIdx], resources.srcMem[0], resources.numBytes, + memcpyKind, stream)); #endif + } } while (++subIterations != cfg.general.numSubIterations); if (cfg.dma.useHipEvents) @@ -4920,20 +5045,22 @@ static bool IsConfiguredGid(union ibv_gid const& gid) #else // Use HSA async copy do { - hsa_signal_store_screlease(resources.signal, 1); - if (!useSubIndices) { - ERR_CHECK(hsa_amd_memory_async_copy(resources.dstMem[0], resources.dstAgent, - resources.srcMem[0], resources.srcAgent, - resources.numBytes, 0, NULL, - resources.signal)); - } else { - HSA_CALL(hsa_amd_memory_async_copy_on_engine(resources.dstMem[0], resources.dstAgent, - resources.srcMem[0], resources.srcAgent, - resources.numBytes, 0, NULL, - resources.signal, - resources.sdmaEngineId, true)); + hsa_signal_store_screlease(resources.signal, numDsts); + for (int dstIdx = 0; dstIdx < numDsts; dstIdx++) { + if (!useSubIndices) { + ERR_CHECK(hsa_amd_memory_async_copy(resources.dstMem[dstIdx], resources.dstAgent[dstIdx], + resources.srcMem[0], resources.srcAgent, + resources.numBytes, 0, NULL, + resources.signal)); + } else { + HSA_CALL(hsa_amd_memory_async_copy_on_engine(resources.dstMem[dstIdx], resources.dstAgent[dstIdx], + resources.srcMem[0], resources.srcAgent, + resources.numBytes, 0, NULL, + resources.signal, + resources.sdmaEngineId, true)); + } } - // Wait for SDMA transfer to complete + // Wait for SDMA transfer(s) to complete while(hsa_signal_wait_scacquire(resources.signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE) >= 1); @@ -4990,6 +5117,88 @@ static bool IsConfiguredGid(union ibv_gid const& gid) return ERR_NONE; } +// BMA Executor-related functions +//======================================================================================== +#ifdef BMA_EXEC_ENABLED + // Execute a single BMA Transfer (one hipMemcpyBatchAsync per sub-iteration; each subexecutor is one 
batch entry) + static ErrResult ExecuteBatchDmaTransfer(int const iteration, + int const exeIndex, + hipStream_t const stream, + hipEvent_t const startEvent, + hipEvent_t const stopEvent, + ConfigOptions const& cfg, + TransferResources& resources) + { + auto cpuStart = std::chrono::high_resolution_clock::now(); + + ERR_CHECK(hipSetDevice(exeIndex)); + + int subIterations = 0; + if (cfg.dma.useHipEvents) + ERR_CHECK(hipEventRecord(startEvent, stream)); + + size_t failIdx = 0; + do { + ERR_CHECK(hipMemcpyBatchAsync(resources.batchDsts.data(), + resources.batchSrcs.data(), + resources.batchBytes.data(), + resources.batchDsts.size(), + nullptr, nullptr, 0, &failIdx, stream)); + } while (++subIterations != cfg.general.numSubIterations); + + if (cfg.dma.useHipEvents) + ERR_CHECK(hipEventRecord(stopEvent, stream)); + ERR_CHECK(hipStreamSynchronize(stream)); + + auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; + double cpuDeltaMsec = std::chrono::duration_cast>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations; + + if (iteration >= 0) { + double deltaMsec = cpuDeltaMsec; + if (cfg.dma.useHipEvents) { + float gpuDeltaMsec; + ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent)); + deltaMsec = gpuDeltaMsec / cfg.general.numSubIterations; + } + resources.totalDurationMsec += deltaMsec; + if (cfg.general.recordPerIteration) + resources.perIterMsec.push_back(deltaMsec); + } + return ERR_NONE; + } + + static ErrResult RunBmaExecutor(int const iteration, + ConfigOptions const& cfg, + int const exeIndex, + ExeInfo& exeInfo) + { + auto cpuStart = std::chrono::high_resolution_clock::now(); + ERR_CHECK(hipSetDevice(exeIndex)); + + vector> asyncTransfers; + for (int i = 0; i < exeInfo.resources.size(); i++) { + asyncTransfers.emplace_back(std::async(std::launch::async, + ExecuteBatchDmaTransfer, + iteration, + exeIndex, + exeInfo.streams[i], + cfg.dma.useHipEvents ? exeInfo.startEvents[i] : NULL, + cfg.dma.useHipEvents ? 
exeInfo.stopEvents[i] : NULL, + std::cref(cfg), + std::ref(exeInfo.resources[i]))); + } + + for (auto& asyncTransfer : asyncTransfers) + ERR_CHECK(asyncTransfer.get()); + + auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; + double deltaMsec = std::chrono::duration_cast>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations; + if (iteration >= 0) + exeInfo.totalDurationMsec += deltaMsec; + return ERR_NONE; + } +#endif // BMA_EXEC_ENABLED + // Executor-related functions //======================================================================================== static ErrResult RunExecutor(int const iteration, @@ -4998,13 +5207,16 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ExeInfo& exeInfo) { switch (exeDevice.exeType) { - case EXE_CPU: return RunCpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); - case EXE_GPU_GFX: return RunGpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); - case EXE_GPU_DMA: return RunDmaExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); + case EXE_CPU: return RunCpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); + case EXE_GPU_GFX: return RunGpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); + case EXE_GPU_DMA: return RunDmaExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); #ifdef NIC_EXEC_ENABLED - case EXE_NIC: return RunNicExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); + case EXE_NIC: return RunNicExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); +#endif +#ifdef BMA_EXEC_ENABLED + case EXE_GPU_BDMA: return RunBmaExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); #endif - default: return {ERR_FATAL, "Unsupported executor (%d)", exeDevice.exeType}; + default: return {ERR_FATAL, "Unsupported executor (%d)", exeDevice.exeType}; } } @@ -5609,7 +5821,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) result |= RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers); wc.exe.exeSubIndices[0] = -2; return result; - case 
EXE_GPU_GFX: case EXE_GPU_DMA: + case EXE_GPU_GFX: case EXE_GPU_DMA: case EXE_GPU_BDMA: { // Iterate over all available subindices ExeDevice exeDevice = {wc.exe.exeType, wc.exe.exeIndices[0], wc.exe.exeRanks[0], 0}; @@ -6338,6 +6550,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) if (status != hipSuccess) numGpus = 0; topo.numExecutors[EXE_GPU_GFX] = numGpus; topo.numExecutors[EXE_GPU_DMA] = numGpus; + topo.numExecutors[EXE_GPU_BDMA] = numGpus; for (int exeIndex = 0; exeIndex < numGpus; exeIndex++) { int numDeviceCUs = 0; @@ -6356,6 +6569,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } topo.executorName[{EXE_GPU_GFX, exeIndex}] = gpuName; topo.executorName[{EXE_GPU_DMA, exeIndex}] = gpuName; + topo.executorName[{EXE_GPU_BDMA, exeIndex}] = gpuName; #if !defined(__NVCC__) hsa_agent_t gpuAgent = gpuAgents[exeIndex]; @@ -6384,8 +6598,10 @@ static bool IsConfiguredGid(union ibv_gid const& gid) #endif topo.numExecutorSubIndices[{EXE_GPU_GFX, exeIndex}] = numXccs; topo.numExecutorSubIndices[{EXE_GPU_DMA, exeIndex}] = numDmaEngines; + topo.numExecutorSubIndices[{EXE_GPU_BDMA, exeIndex}] = 0; topo.numSubExecutors[{EXE_GPU_GFX, exeIndex}] = numDeviceCUs; topo.numSubExecutors[{EXE_GPU_DMA, exeIndex}] = 1; + topo.numSubExecutors[{EXE_GPU_BDMA, exeIndex}] = numDmaEngines; topo.closestCpuNumaToGpu[exeIndex] = closestNuma; topo.closestNicsToGpu[exeIndex] = {}; } @@ -6805,7 +7021,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) return {ERR_FATAL, "CPU index must be between 0 and %d inclusively", numCpus - 1}; agent = cpuAgents[exeDevice.exeIndex]; break; - case EXE_GPU_GFX: case EXE_GPU_DMA: + case EXE_GPU_GFX: case EXE_GPU_DMA: case EXE_GPU_BDMA: if (exeIndex < 0 || exeIndex >= numGpus) return {ERR_FATAL, "GPU index must be between 0 and %d inclusively", numGpus - 1}; agent = gpuAgents[exeIndex]; break; @@ -7164,6 +7380,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) #undef hipErrorPeerAccessAlreadyEnabled #undef hipFuncCachePreferShared 
#undef hipMemcpyDefault +#undef hipMemcpyKind #undef hipMemcpyDeviceToHost #undef hipMemcpyHostToDevice #undef hipSuccess From 2372321eb6d71dd963a3e18d623d078565aa7f7d Mon Sep 17 00:00:00 2001 From: Gilbert Lee Date: Fri, 10 Apr 2026 20:29:01 -0500 Subject: [PATCH 2/4] Minor fixes to Batched DMA support --- CHANGELOG.md | 2 +- examples/example.cfg | 4 +-- src/client/Presets/BmaSweep.hpp | 51 +++++++++++++++++++++++++-------- src/header/TransferBench.hpp | 26 ++++++++++------- 4 files changed, 57 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cb79743..2a4d775 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,7 +26,7 @@ Documentation for TransferBench is available at - NIC_FILTER renamed to TB_NIC_FILTER for consistency - DUMP_LINES renamed to TB_DUMP_LINES for consistency - Dynamically size CQs for NIC transfers in high QPs case -- Switch to using hipMemcpyDevicetoDeviceNoCU instead of hipMemcpyDefault for DMA Executor if available (requires HIP >= 6.0) +- Switch to using hipMemcpyDeviceToDeviceNoCU instead of hipMemcpyDefault for DMA Executor if available (requires HIP >= 6.0) - Allow for multiple destination memory locations for DMA/Batched-DMA Transfers ## v1.66.02 diff --git a/examples/example.cfg b/examples/example.cfg index 9c5e29f..14df7e3 100644 --- a/examples/example.cfg +++ b/examples/example.cfg @@ -12,9 +12,9 @@ # Executor: SubExecutor: # 1) CPU CPU thread # 2) GPU GPU threadblock/Compute Unit (CU) -# 3) DMA N/A. (May only be used for copies (single SRC/DST) +# 3) DMA N/A. 
(Must have single SRC, at least one DST) # 4) NIC Queue Pair -# 5) Batched-DMA Batch size +# 5) Batched-DMA Batch item (Must have single SRC, at least one DST) # Each single line in the configuration file defines a set of Transfers (a Test) to run in parallel diff --git a/src/client/Presets/BmaSweep.hpp b/src/client/Presets/BmaSweep.hpp index cb28f82..6ef1df8 100644 --- a/src/client/Presets/BmaSweep.hpp +++ b/src/client/Presets/BmaSweep.hpp @@ -31,7 +31,7 @@ int BmaSweepPreset(EnvVars& ev, } #ifndef BMA_EXEC_ENABLED - Utils::Print("[ERROR] BMA executor requires ROCm 7.0 or newer\n"); + Utils::Print("[ERROR] BMA executor requires ROCm 7.1 or newer\n"); return 1; #endif @@ -40,9 +40,11 @@ int BmaSweepPreset(EnvVars& ev, // Collect env vars for this preset int exeIndex = EnvVars::GetEnvVar("EXE_INDEX" , 0); int localCopy = EnvVars::GetEnvVar("LOCAL_COPY" , 0); + vector gfxSesList = EnvVars::GetEnvVarArray("GFX_SUB_EXECS", {}); int gpuMemTypeIdx = EnvVars::GetEnvVar("GPU_MEM_TYPE" , 0); int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES" , numDetectedGpus); - vector numSesList = EnvVars::GetEnvVarArray("NUM_SUB_EXECS", {1,2,4,8}); + vector bmaSesList = EnvVars::GetEnvVarArray("NUM_SUB_EXECS", {1,2,4,8}); + MemType gpuMemType = Utils::GetGpuMemType(gpuMemTypeIdx); @@ -54,9 +56,10 @@ int BmaSweepPreset(EnvVars& ev, if (!outputToCsv) printf("[BMA Sweep Related]\n"); ev.Print("EXE_INDEX" , exeIndex, "Executing on GPU %d", exeIndex); ev.Print("LOCAL_COPY" , localCopy, "%s local copy to GPU %d", localCopy ? 
"Including" : "Excluding", exeIndex); + ev.Print("GFX_SUB_EXECS" , gfxSesList.size(), EnvVars::ToStr(gfxSesList).c_str()); ev.Print("GPU_MEM_TYPE" , gpuMemTypeIdx, "Using %s (%s)", Utils::GetGpuMemTypeStr(gpuMemTypeIdx).c_str(), Utils::GetAllGpuMemTypeStr().c_str()); ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices); - ev.Print("NUM_SUB_EXECS" , numSesList.size(), EnvVars::ToStr(numSesList).c_str()); + ev.Print("NUM_SUB_EXECS" , bmaSesList.size(), EnvVars::ToStr(bmaSesList).c_str()); printf("\n"); } } @@ -66,7 +69,9 @@ int BmaSweepPreset(EnvVars& ev, return 1; } - int numTransfers = numGpuDevices - 1 + (localCopy ? 1 : 0); + int numTransfers = numGpuDevices - 1 + (localCopy ? 1 : 0); + int numBmaSubExec = (int)bmaSesList.size(); + int numGfxSubExec = (int)gfxSesList.size(); TransferBench::ConfigOptions cfg = ev.ToConfigOptions(); TransferBench::TestResults results; @@ -74,25 +79,30 @@ int BmaSweepPreset(EnvVars& ev, // Prepare table of results int minPow2Exp = 12; int maxPow2Exp = 30; - int numRows = (maxPow2Exp - minPow2Exp + 1) + 1; - int numCols = 2 + numSesList.size(); + int numRows = 1 + (bytesSpecified ? 
1 : (maxPow2Exp - minPow2Exp + 1)); + int numCols = 2 + numBmaSubExec + numGfxSubExec; Utils::TableHelper table(numRows, numCols); - - Utils::Print("Performing %d simultaneous DMA Transfers from GPU %0 to other GPUs\n", numTransfers, exeIndex); + Utils::Print("Performing %d simultaneous DMA Transfers from GPU %d to other GPUs\n", numTransfers, exeIndex); // Prepare headers + table.Set(0, 0, " Bytes "); table.Set(0, 1, " DMA "); - for (int i = 0; i < numSesList.size(); i++) { - table.Set(0, 2+i, " BMA (%d) ", numSesList[i]); + for (int i = 0; i < numBmaSubExec; i++) { + table.Set(0, 2+i, " BMA(%02d) ", bmaSesList[i]); + } + for (int i = 0; i < numGfxSubExec; i++) { + table.Set(0, 2+numBmaSubExec+i, " GFX(%02d) ", gfxSesList[i]); } + table.DrawRowBorder(0); table.DrawRowBorder(1); table.DrawRowBorder(numRows); table.DrawColBorder(0); table.DrawColBorder(1); table.DrawColBorder(2); + table.DrawColBorder(2+numBmaSubExec); table.DrawColBorder(numCols); if (!ev.outputToCsv){ @@ -101,6 +111,8 @@ int BmaSweepPreset(EnvVars& ev, }; for (size_t numBytes = 1ULL<= 7.0 -#if defined(__HIP_PLATFORM_AMD__) && defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >= 7) +// Batched DMA executor is only supported with HIP >= 7.1 +#if defined(__HIP_PLATFORM_AMD__) && \ + defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >= 7) && \ + defined(HIP_VERSION_MINOR) && (HIP_VERSION_MINOR >= 1) #define BMA_EXEC_ENABLED #endif @@ -104,7 +106,7 @@ namespace TransferBench EXE_GPU_DMA = 2, ///< GPU SDMA executor (subExecutor = not supported) EXE_NIC = 3, ///< NIC RDMA executor (subExecutor = queue pair) EXE_NIC_NEAREST = 4, ///< NIC RDMA nearest executor (subExecutor = queue pair) - EXE_GPU_BDMA = 5, ///< GPU Batched SDMA execttor (subExecutor = batch size) + EXE_GPU_BDMA = 5, ///< GPU Batched SDMA executor (subExecutor = batch item) }; char const ExeTypeStr[7] = "CGDINB"; inline bool IsCpuExeType(ExeType e){ return e == EXE_CPU; } @@ -2170,7 +2172,7 @@ namespace { } if (t.numBytes % 4) { - 
errors.push_back({ERR_FATAL, "Transfer %d: numBytes must be a multiple of 4\n", t.numBytes}); + errors.push_back({ERR_FATAL, "Transfer %d: numBytes (%lu) must be a multiple of 4\n", i, t.numBytes}); break; } @@ -2418,7 +2420,7 @@ namespace { break; #else errors.push_back({ERR_FATAL, - "Transfer %d: BMA executor requires ROCm 7.0 or newer (AMD HIP with hipMemcpyBatchAsync)", i}); + "Transfer %d: BMA executor requires ROCm 7.1 or newer (AMD HIP with hipMemcpyBatchAsync)", i}); hasFatalError = true; break; #endif @@ -3890,11 +3892,13 @@ static bool IsConfiguredGid(union ibv_gid const& gid) rss.batchSrcs.clear(); rss.batchBytes.clear(); - for (int i = 0; i < transfer.numSubExecs; ++i) { - for (int j = 0; j < (int)rss.dstMem.size(); j++) { - rss.batchSrcs.push_back(subExecParam[i].src[0]); - rss.batchDsts.push_back(subExecParam[i].dst[j]); - rss.batchBytes.push_back(subExecParam[i].N * sizeof(float)); + if (transfer.exeDevice.exeType == EXE_GPU_BMDA) { + for (int i = 0; i < transfer.numSubExecs; ++i) { + for (int j = 0; j < (int)rss.dstMem.size(); j++) { + rss.batchSrcs.push_back(subExecParam[i].src[0]); + rss.batchDsts.push_back(subExecParam[i].dst[j]); + rss.batchBytes.push_back(subExecParam[i].N * sizeof(float)); + } } } #endif @@ -6601,7 +6605,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) topo.numExecutorSubIndices[{EXE_GPU_BDMA, exeIndex}] = 0; topo.numSubExecutors[{EXE_GPU_GFX, exeIndex}] = numDeviceCUs; topo.numSubExecutors[{EXE_GPU_DMA, exeIndex}] = 1; - topo.numSubExecutors[{EXE_GPU_DMA, exeIndex}] = numDmaEngines; + topo.numSubExecutors[{EXE_GPU_BDMA, exeIndex}] = numDmaEngines; topo.closestCpuNumaToGpu[exeIndex] = closestNuma; topo.closestNicsToGpu[exeIndex] = {}; } From 8bab3a2e7deefb232062e287c8c28b71a10034bf Mon Sep 17 00:00:00 2001 From: Gilbert Lee Date: Sat, 11 Apr 2026 15:59:06 -0500 Subject: [PATCH 3/4] Fixing up typos / bugs --- CHANGELOG.md | 2 +- src/header/TransferBench.hpp | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) 
diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a4d775..d0dd559 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ Documentation for TransferBench is available at - Adding NIC_CQ_POLL_BATCH to control CQ poll batch size for NIC transfers - New "hbm" preset which sweeps and tests local HBM read performance - Added a new TB_WALLCLOCK_RATE that will override GPU GFX wallclock rate if it returns 0 (debug) -- Adding new batched-DMA executor "B", which utilizes the hipMemcpyBatchAsync API introduced in HIP 7.0 +- Adding new batched-DMA executor "B", which utilizes the hipMemcpyBatchAsync API introduced in HIP 7.1 - Added new bmasweep preset that compares DMA to batched DMA execution for parallel transfers to other GPUs ### Modified diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp index 815a0b8..e7e1720 100644 --- a/src/header/TransferBench.hpp +++ b/src/header/TransferBench.hpp @@ -79,9 +79,8 @@ THE SOFTWARE. /// @endcond // Batched DMA executor is only supported with HIP >= 7.1 -#if defined(__HIP_PLATFORM_AMD__) && \ - defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >= 7) && \ - defined(HIP_VERSION_MINOR) && (HIP_VERSION_MINOR >= 1) +#if defined(__HIP_PLATFORM_AMD__) && defined(HIP_VERSION_MAJOR) && defined(HIP_VERSION_MINOR) && \ + ((HIP_VERSION_MAJOR > 7) || (HIP_VERSION_MAJOR == 7 && HIP_VERSION_MINOR >= 1)) #define BMA_EXEC_ENABLED #endif @@ -3892,7 +3891,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) rss.batchSrcs.clear(); rss.batchBytes.clear(); - if (transfer.exeDevice.exeType == EXE_GPU_BMDA) { + if (transfer.exeDevice.exeType == EXE_GPU_BDMA) { for (int i = 0; i < transfer.numSubExecs; ++i) { for (int j = 0; j < (int)rss.dstMem.size(); j++) { rss.batchSrcs.push_back(subExecParam[i].src[0]); From e55d2a58f93efb05c5192aecb1fc450a19604295 Mon Sep 17 00:00:00 2001 From: Gilbert Lee Date: Mon, 13 Apr 2026 23:49:19 -0500 Subject: [PATCH 4/4] Adding support for cudaMemcpyBatchAsync --- src/header/TransferBench.hpp | 19 
+++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp index e7e1720..aff26a5 100644 --- a/src/header/TransferBench.hpp +++ b/src/header/TransferBench.hpp @@ -78,9 +78,8 @@ THE SOFTWARE. #endif /// @endcond -// Batched DMA executor is only supported with HIP >= 7.1 -#if defined(__HIP_PLATFORM_AMD__) && defined(HIP_VERSION_MAJOR) && defined(HIP_VERSION_MINOR) && \ - ((HIP_VERSION_MAJOR > 7) || (HIP_VERSION_MAJOR == 7 && HIP_VERSION_MINOR >= 1)) +// Batched DMA executor is only supported with HIP >= 7.1 and CUDA 12.8 +#if (defined(HIP_VERSION) && (HIP_VERSION >= 710)) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12080)) #define BMA_EXEC_ENABLED #endif @@ -659,6 +658,7 @@ namespace TransferBench #define hipMallocManaged cudaMallocManaged #define hipMemcpy cudaMemcpy #define hipMemcpyAsync cudaMemcpyAsync + #define hipMemcpyBatchAsync cudaMemcpyBatchAsync #define hipMemset cudaMemset #define hipMemsetAsync cudaMemsetAsync #define hipSetDevice cudaSetDevice @@ -2701,9 +2701,9 @@ namespace { // For BMA executor #ifdef BMA_EXEC_ENABLED - vector batchDsts; - vector batchSrcs; - vector batchBytes; + vector batchDsts; ///< Destination pointers (per batch item) + vector batchSrcs; ///< Source pointers (per batch item) + vector batchBytes; ///< Bytes to copy (per batch item) #endif // Counters @@ -3890,7 +3890,6 @@ static bool IsConfiguredGid(union ibv_gid const& gid) rss.batchDsts.clear(); rss.batchSrcs.clear(); rss.batchBytes.clear(); - if (transfer.exeDevice.exeType == EXE_GPU_BDMA) { for (int i = 0; i < transfer.numSubExecs; ++i) { for (int j = 0; j < (int)rss.dstMem.size(); j++) { @@ -5146,7 +5145,11 @@ static bool IsConfiguredGid(union ibv_gid const& gid) resources.batchSrcs.data(), resources.batchBytes.data(), resources.batchDsts.size(), - nullptr, nullptr, 0, &failIdx, stream)); + nullptr, nullptr, 0, +#if !defined(__NVCC__) + &failIdx, +#endif + stream)); } while (++subIterations 
!= cfg.general.numSubIterations); if (cfg.dma.useHipEvents)