Skip to content

Commit 2372321

Browse files
Minor fixes to Batched DMA support
1 parent e100737 commit 2372321

4 files changed

Lines changed: 57 additions & 26 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ Documentation for TransferBench is available at
2626
- NIC_FILTER renamed to TB_NIC_FILTER for consistency
2727
- DUMP_LINES renamed to TB_DUMP_LINES for consistency
2828
- Dynamically size CQs for NIC transfers in high QPs case
29-
- Switch to using hipMemcpyDevicetoDeviceNoCU instead of hipMemcpyDefault for DMA Executor if available (requires HIP >= 6.0)
29+
- Switch to using hipMemcpyDeviceToDeviceNoCU instead of hipMemcpyDefault for DMA Executor if available (requires HIP >= 6.0)
3030
- Allow for multiple destination memory locations for DMA/Batched-DMA Transfers
3131

3232
## v1.66.02

examples/example.cfg

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@
1212
# Executor: SubExecutor:
1313
# 1) CPU CPU thread
1414
# 2) GPU GPU threadblock/Compute Unit (CU)
15-
# 3) DMA N/A. (May only be used for copies (single SRC/DST)
15+
# 3) DMA N/A. (Must have single SRC, at least one DST)
1616
# 4) NIC Queue Pair
17-
# 5) Batched-DMA Batch size
17+
# 5) Batched-DMA Batch item (Must have single SRC, at least one DST)
1818

1919
# Each single line in the configuration file defines a set of Transfers (a Test) to run in parallel
2020

src/client/Presets/BmaSweep.hpp

Lines changed: 39 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ int BmaSweepPreset(EnvVars& ev,
3131
}
3232

3333
#ifndef BMA_EXEC_ENABLED
34-
Utils::Print("[ERROR] BMA executor requires ROCm 7.0 or newer\n");
34+
Utils::Print("[ERROR] BMA executor requires ROCm 7.1 or newer\n");
3535
return 1;
3636
#endif
3737

@@ -40,9 +40,11 @@ int BmaSweepPreset(EnvVars& ev,
4040
// Collect env vars for this preset
4141
int exeIndex = EnvVars::GetEnvVar("EXE_INDEX" , 0);
4242
int localCopy = EnvVars::GetEnvVar("LOCAL_COPY" , 0);
43+
vector<int> gfxSesList = EnvVars::GetEnvVarArray("GFX_SUB_EXECS", {});
4344
int gpuMemTypeIdx = EnvVars::GetEnvVar("GPU_MEM_TYPE" , 0);
4445
int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES" , numDetectedGpus);
45-
vector<int> numSesList = EnvVars::GetEnvVarArray("NUM_SUB_EXECS", {1,2,4,8});
46+
vector<int> bmaSesList = EnvVars::GetEnvVarArray("NUM_SUB_EXECS", {1,2,4,8});
47+
4648

4749
MemType gpuMemType = Utils::GetGpuMemType(gpuMemTypeIdx);
4850

@@ -54,9 +56,10 @@ int BmaSweepPreset(EnvVars& ev,
5456
if (!outputToCsv) printf("[BMA Sweep Related]\n");
5557
ev.Print("EXE_INDEX" , exeIndex, "Executing on GPU %d", exeIndex);
5658
ev.Print("LOCAL_COPY" , localCopy, "%s local copy to GPU %d", localCopy ? "Including" : "Excluding", exeIndex);
59+
ev.Print("GFX_SUB_EXECS" , gfxSesList.size(), EnvVars::ToStr(gfxSesList).c_str());
5760
ev.Print("GPU_MEM_TYPE" , gpuMemTypeIdx, "Using %s (%s)", Utils::GetGpuMemTypeStr(gpuMemTypeIdx).c_str(), Utils::GetAllGpuMemTypeStr().c_str());
5861
ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices);
59-
ev.Print("NUM_SUB_EXECS" , numSesList.size(), EnvVars::ToStr(numSesList).c_str());
62+
ev.Print("NUM_SUB_EXECS" , bmaSesList.size(), EnvVars::ToStr(bmaSesList).c_str());
6063
printf("\n");
6164
}
6265
}
@@ -66,33 +69,40 @@ int BmaSweepPreset(EnvVars& ev,
6669
return 1;
6770
}
6871

69-
int numTransfers = numGpuDevices - 1 + (localCopy ? 1 : 0);
72+
int numTransfers = numGpuDevices - 1 + (localCopy ? 1 : 0);
73+
int numBmaSubExec = (int)bmaSesList.size();
74+
int numGfxSubExec = (int)gfxSesList.size();
7075

7176
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
7277
TransferBench::TestResults results;
7378

7479
// Prepare table of results
7580
int minPow2Exp = 12;
7681
int maxPow2Exp = 30;
77-
int numRows = (maxPow2Exp - minPow2Exp + 1) + 1;
78-
int numCols = 2 + numSesList.size();
82+
int numRows = 1 + (bytesSpecified ? 1 : (maxPow2Exp - minPow2Exp + 1));
83+
int numCols = 2 + numBmaSubExec + numGfxSubExec;
7984

8085
Utils::TableHelper table(numRows, numCols);
81-
82-
Utils::Print("Performing %d simultaneous DMA Transfers from GPU %0 to other GPUs\n", numTransfers, exeIndex);
86+
Utils::Print("Performing %d simultaneous DMA Transfers from GPU %d to other GPUs\n", numTransfers, exeIndex);
8387

8488
// Prepare headers
89+
8590
table.Set(0, 0, " Bytes ");
8691
table.Set(0, 1, " DMA ");
87-
for (int i = 0; i < numSesList.size(); i++) {
88-
table.Set(0, 2+i, " BMA (%d) ", numSesList[i]);
92+
for (int i = 0; i < numBmaSubExec; i++) {
93+
table.Set(0, 2+i, " BMA(%02d) ", bmaSesList[i]);
94+
}
95+
for (int i = 0; i < numGfxSubExec; i++) {
96+
table.Set(0, 2+numBmaSubExec+i, " GFX(%02d) ", gfxSesList[i]);
8997
}
98+
9099
table.DrawRowBorder(0);
91100
table.DrawRowBorder(1);
92101
table.DrawRowBorder(numRows);
93102
table.DrawColBorder(0);
94103
table.DrawColBorder(1);
95104
table.DrawColBorder(2);
105+
table.DrawColBorder(2+numBmaSubExec);
96106
table.DrawColBorder(numCols);
97107

98108
if (!ev.outputToCsv){
@@ -101,6 +111,8 @@ int BmaSweepPreset(EnvVars& ev,
101111
};
102112

103113
for (size_t numBytes = 1ULL<<minPow2Exp, currRow=1; numBytes <= (1ULL<<maxPow2Exp); numBytes<<=1, currRow++) {
114+
if (bytesSpecified) numBytes = numBytesPerTransfer;
115+
104116
if (!ev.outputToCsv) {
105117
Utils::Print(".");
106118
fflush(stdout);
@@ -132,8 +144,8 @@ int BmaSweepPreset(EnvVars& ev,
132144

133145
// BMA executor next
134146
t.exeDevice = {EXE_GPU_BDMA, exeIndex};
135-
for (int i = 0; i < numSesList.size(); i++) {
136-
t.numSubExecs = numSesList[i];
147+
for (int i = 0; i < numBmaSubExec; i++) {
148+
t.numSubExecs = bmaSesList[i];
137149

138150
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
139151
for (auto const& err : results.errResults)
@@ -143,6 +155,21 @@ int BmaSweepPreset(EnvVars& ev,
143155

144156
table.Set(currRow, 2+i, " %6.2f ", numTransfers * results.tfrResults[0].avgBandwidthGbPerSec);
145157
}
158+
159+
// GFX executor last
160+
t.exeDevice = {EXE_GPU_GFX, exeIndex};
161+
for (int i = 0; i < numGfxSubExec; i++) {
162+
t.numSubExecs = gfxSesList[i];
163+
164+
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
165+
for (auto const& err : results.errResults)
166+
Utils::Print("%s\n", err.errMsg.c_str());
167+
return 1;
168+
}
169+
170+
table.Set(currRow, 2+numBmaSubExec+i, " %6.2f ", numTransfers * results.tfrResults[0].avgBandwidthGbPerSec);
171+
}
172+
if (bytesSpecified) break;
146173
}
147174

148175
if (!ev.outputToCsv) {

src/header/TransferBench.hpp

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,10 @@ THE SOFTWARE.
7878
#endif
7979
/// @endcond
8080

81-
// Batched DMA executor is only supported with HIP >= 7.0
82-
#if defined(__HIP_PLATFORM_AMD__) && defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >= 7)
81+
// Batched DMA executor is only supported with HIP >= 7.1
82+
#if defined(__HIP_PLATFORM_AMD__) && \
83+
defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >= 7) && \
84+
defined(HIP_VERSION_MINOR) && (HIP_VERSION_MINOR >= 1)
8385
#define BMA_EXEC_ENABLED
8486
#endif
8587

@@ -104,7 +106,7 @@ namespace TransferBench
104106
EXE_GPU_DMA = 2, ///< GPU SDMA executor (subExecutor = not supported)
105107
EXE_NIC = 3, ///< NIC RDMA executor (subExecutor = queue pair)
106108
EXE_NIC_NEAREST = 4, ///< NIC RDMA nearest executor (subExecutor = queue pair)
107-
EXE_GPU_BDMA = 5, ///< GPU Batched SDMA execttor (subExecutor = batch size)
109+
EXE_GPU_BDMA = 5, ///< GPU Batched SDMA executor (subExecutor = batch item)
108110
};
109111
char const ExeTypeStr[7] = "CGDINB";
110112
inline bool IsCpuExeType(ExeType e){ return e == EXE_CPU; }
@@ -2170,7 +2172,7 @@ namespace {
21702172
}
21712173

21722174
if (t.numBytes % 4) {
2173-
errors.push_back({ERR_FATAL, "Transfer %d: numBytes must be a multiple of 4\n", t.numBytes});
2175+
errors.push_back({ERR_FATAL, "Transfer %d: numBytes (%lu) must be a multiple of 4\n", i, t.numBytes});
21742176
break;
21752177
}
21762178

@@ -2418,7 +2420,7 @@ namespace {
24182420
break;
24192421
#else
24202422
errors.push_back({ERR_FATAL,
2421-
"Transfer %d: BMA executor requires ROCm 7.0 or newer (AMD HIP with hipMemcpyBatchAsync)", i});
2423+
"Transfer %d: BMA executor requires ROCm 7.1 or newer (AMD HIP with hipMemcpyBatchAsync)", i});
24222424
hasFatalError = true;
24232425
break;
24242426
#endif
@@ -3890,11 +3892,13 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
38903892
rss.batchSrcs.clear();
38913893
rss.batchBytes.clear();
38923894

3893-
for (int i = 0; i < transfer.numSubExecs; ++i) {
3894-
for (int j = 0; j < (int)rss.dstMem.size(); j++) {
3895-
rss.batchSrcs.push_back(subExecParam[i].src[0]);
3896-
rss.batchDsts.push_back(subExecParam[i].dst[j]);
3897-
rss.batchBytes.push_back(subExecParam[i].N * sizeof(float));
3895+
if (transfer.exeDevice.exeType == EXE_GPU_BDMA) {
3896+
for (int i = 0; i < transfer.numSubExecs; ++i) {
3897+
for (int j = 0; j < (int)rss.dstMem.size(); j++) {
3898+
rss.batchSrcs.push_back(subExecParam[i].src[0]);
3899+
rss.batchDsts.push_back(subExecParam[i].dst[j]);
3900+
rss.batchBytes.push_back(subExecParam[i].N * sizeof(float));
3901+
}
38983902
}
38993903
}
39003904
#endif
@@ -6601,7 +6605,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
66016605
topo.numExecutorSubIndices[{EXE_GPU_BDMA, exeIndex}] = 0;
66026606
topo.numSubExecutors[{EXE_GPU_GFX, exeIndex}] = numDeviceCUs;
66036607
topo.numSubExecutors[{EXE_GPU_DMA, exeIndex}] = 1;
6604-
topo.numSubExecutors[{EXE_GPU_DMA, exeIndex}] = numDmaEngines;
6608+
topo.numSubExecutors[{EXE_GPU_BDMA, exeIndex}] = numDmaEngines;
66056609
topo.closestCpuNumaToGpu[exeIndex] = closestNuma;
66066610
topo.closestNicsToGpu[exeIndex] = {};
66076611
}

0 commit comments

Comments
 (0)