From e10073777a70809eef2cc92250f111dce6f4ace6 Mon Sep 17 00:00:00 2001 From: Gilbert Lee Date: Fri, 10 Apr 2026 16:18:41 -0500 Subject: [PATCH 1/4] Adding Batched DMA support (hipMemcpyBatchAsync), and bmasweep preset --- CHANGELOG.md | 20 +- examples/example.cfg | 4 +- src/client/Presets/BmaSweep.hpp | 155 +++++++++++++++ src/client/Presets/Presets.hpp | 4 +- src/client/Utilities.hpp | 13 +- src/header/TransferBench.hpp | 333 ++++++++++++++++++++++++++------ 6 files changed, 455 insertions(+), 74 deletions(-) create mode 100644 src/client/Presets/BmaSweep.hpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 280fd82..cb79743 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,16 +14,20 @@ Documentation for TransferBench is available at - Adding NIC_CQ_POLL_BATCH to control CQ poll batch size for NIC transfers - New "hbm" preset which sweeps and tests local HBM read performance - Added a new TB_WALLCLOCK_RATE that will override GPU GFX wallclock rate if it returns 0 (debug) +- Adding new batched-DMA executor "B", which utilizes the hipMemcpyBatchAsync API introduced in HIP 7.0 +- Added new bmasweep preset that compares DMA to batched DMA execution for parallel transfers to other GPUs ### Modified - - DMA-BUF support enablement in CMake changed to ENABLE_DMA_BUF to be more similar to other compile-time options - - Adding extra information to CMake and make build methods to indicate enabled / disabled features - - a2asweep preset changes from USE_FINE_GRAIN to MEM_TYPE to reflect various memory types - - a2asweep preset changes from NUM_CUS to NUM_SUB_EXECS to match with a2a preset naming convention - - scaling preset changes from using USE_FINE_GRAIN to CPU_MEM_TYPE and GPU_MEM_TYPE - - NIC_FILTER renamed to TB_NIC_FILTER for consistency - - DUMP_LINES renamed to TB_DUMP_LINES for consistency - - Dynamically size CQs for NIC transfers in high QPs case +- DMA-BUF support enablement in CMake changed to ENABLE_DMA_BUF to be more similar to other compile-time options +- 
Adding extra information to CMake and make build methods to indicate enabled / disabled features +- a2asweep preset changes from USE_FINE_GRAIN to MEM_TYPE to reflect various memory types +- a2asweep preset changes from NUM_CUS to NUM_SUB_EXECS to match with a2a preset naming convention +- scaling preset changes from using USE_FINE_GRAIN to CPU_MEM_TYPE and GPU_MEM_TYPE +- NIC_FILTER renamed to TB_NIC_FILTER for consistency +- DUMP_LINES renamed to TB_DUMP_LINES for consistency +- Dynamically size CQs for NIC transfers in high QPs case +- Switch to using hipMemcpyDevicetoDeviceNoCU instead of hipMemcpyDefault for DMA Executor if available (requires HIP >= 6.0) +- Allow for multiple destination memory locations for DMA/Batched-DMA Transfers ## v1.66.02 ### Added diff --git a/examples/example.cfg b/examples/example.cfg index 57d4ae0..9c5e29f 100644 --- a/examples/example.cfg +++ b/examples/example.cfg @@ -8,12 +8,13 @@ # SRC 1 -> Executor -> DST 1 # SRC X DST Y -# Three Executors are supported by TransferBench +# Five Executors are supported by TransferBench # Executor: SubExecutor: # 1) CPU CPU thread # 2) GPU GPU threadblock/Compute Unit (CU) # 3) DMA N/A. 
(May only be used for copies (single SRC/DST) # 4) NIC Queue Pair +# 5) Batched-DMA Batch size # Each single line in the configuration file defines a set of Transfers (a Test) to run in parallel @@ -38,6 +39,7 @@ # - C: CPU-executed (Indexed from 0 to # NUMA nodes - 1) # - G: GPU-executed (Indexed from 0 to # GPUs - 1) # - D: DMA-executor (Indexed from 0 to # GPUs - 1) +# - B: Batched-DMA-executor (Indexed from 0 to # GPUs - 1) # - I#.#: NIC executor (Indexed from 0 to # NICs - 1) # - N#.#: Nearest NIC executor (Indexed from 0 to # GPUs - 1) # dstMemL : Destination memory locations (Where the data is to be written to) diff --git a/src/client/Presets/BmaSweep.hpp b/src/client/Presets/BmaSweep.hpp new file mode 100644 index 0000000..cb28f82 --- /dev/null +++ b/src/client/Presets/BmaSweep.hpp @@ -0,0 +1,155 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +int BmaSweepPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) +{ + if (TransferBench::GetNumRanks() > 1) { + Utils::Print("[ERROR] BMA sweep preset currently not supported for multi-node\n"); + return 1; + } + +#ifndef BMA_EXEC_ENABLED + Utils::Print("[ERROR] BMA executor requires ROCm 7.0 or newer\n"); + return 1; +#endif + + int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); + + // Collect env vars for this preset + int exeIndex = EnvVars::GetEnvVar("EXE_INDEX" , 0); + int localCopy = EnvVars::GetEnvVar("LOCAL_COPY" , 0); + int gpuMemTypeIdx = EnvVars::GetEnvVar("GPU_MEM_TYPE" , 0); + int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES" , numDetectedGpus); + vector numSesList = EnvVars::GetEnvVarArray("NUM_SUB_EXECS", {1,2,4,8}); + + MemType gpuMemType = Utils::GetGpuMemType(gpuMemTypeIdx); + + // Display environment variables + if (Utils::RankDoesOutput()) { + ev.DisplayEnvVars(); + if (!ev.hideEnv) { + int outputToCsv = ev.outputToCsv; + if (!outputToCsv) printf("[BMA Sweep Related]\n"); + ev.Print("EXE_INDEX" , exeIndex, "Executing on GPU %d", exeIndex); + ev.Print("LOCAL_COPY" , localCopy, "%s local copy to GPU %d", localCopy ? "Including" : "Excluding", exeIndex); + ev.Print("GPU_MEM_TYPE" , gpuMemTypeIdx, "Using %s (%s)", Utils::GetGpuMemTypeStr(gpuMemTypeIdx).c_str(), Utils::GetAllGpuMemTypeStr().c_str()); + ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices); + ev.Print("NUM_SUB_EXECS" , numSesList.size(), EnvVars::ToStr(numSesList).c_str()); + printf("\n"); + } + } + + if (exeIndex < 0 || exeIndex >= numGpuDevices) { + Utils::Print("EXE_INDEX must be between 0 and %d inclusively\n", numGpuDevices - 1); + return 1; + } + + int numTransfers = numGpuDevices - 1 + (localCopy ? 
1 : 0); + + TransferBench::ConfigOptions cfg = ev.ToConfigOptions(); + TransferBench::TestResults results; + + // Prepare table of results + int minPow2Exp = 12; + int maxPow2Exp = 30; + int numRows = (maxPow2Exp - minPow2Exp + 1) + 1; + int numCols = 2 + numSesList.size(); + + Utils::TableHelper table(numRows, numCols); + + Utils::Print("Performing %d simultaneous DMA Transfers from GPU %d to other GPUs\n", numTransfers, exeIndex); + + // Prepare headers + table.Set(0, 0, " Bytes "); + table.Set(0, 1, " DMA "); + for (int i = 0; i < numSesList.size(); i++) { + table.Set(0, 2+i, " BMA (%d) ", numSesList[i]); + } + table.DrawRowBorder(0); + table.DrawRowBorder(1); + table.DrawRowBorder(numRows); + table.DrawColBorder(0); + table.DrawColBorder(1); + table.DrawColBorder(2); + table.DrawColBorder(numCols); + + if (!ev.outputToCsv){ + Utils::Print("Executing: "); + fflush(stdout); + }; + + for (size_t numBytes = 1ULL< transfers(1); + + Transfer& t = transfers[0]; + t.numBytes = numBytes; + t.srcs = {{gpuMemType, exeIndex}}; + t.dsts.clear(); + for (int i = 0; i < numGpuDevices; i++) { + if (i == exeIndex && localCopy == 0) continue; + t.dsts.push_back({gpuMemType, i}); + } + + // DMA executor first + t.exeDevice = {EXE_GPU_DMA, exeIndex}; + t.numSubExecs = 1; + + if (!TransferBench::RunTransfers(cfg, transfers, results)) { + for (auto const& err : results.errResults) + Utils::Print("%s\n", err.errMsg.c_str()); + return 1; + } + + table.Set(currRow, 1, " %6.2f ", numTransfers * results.tfrResults[0].avgBandwidthGbPerSec); + + // BMA executor next + t.exeDevice = {EXE_GPU_BDMA, exeIndex}; + for (int i = 0; i < numSesList.size(); i++) { + t.numSubExecs = numSesList[i]; + + if (!TransferBench::RunTransfers(cfg, transfers, results)) { + for (auto const& err : results.errResults) + Utils::Print("%s\n", err.errMsg.c_str()); + return 1; + } + + table.Set(currRow, 2+i, " %6.2f ", numTransfers * results.tfrResults[0].avgBandwidthGbPerSec); + } + } + + if (!ev.outputToCsv) { + 
Utils::Print("\n"); + } + table.PrintTable(ev.outputToCsv, ev.showBorders); + Utils::Print("Reported numbers are all GB/s, normalized for per Transfer for %d Transfers\n", numTransfers); + + return 0; +} diff --git a/src/client/Presets/Presets.hpp b/src/client/Presets/Presets.hpp index d856c6a..6b2dfd6 100644 --- a/src/client/Presets/Presets.hpp +++ b/src/client/Presets/Presets.hpp @@ -30,6 +30,7 @@ THE SOFTWARE. #include "AllToAll.hpp" #include "AllToAllN.hpp" #include "AllToAllSweep.hpp" +#include "BmaSweep.hpp" #include "GfxSweep.hpp" #include "HbmBandwidth.hpp" #include "HealthCheck.hpp" @@ -53,7 +54,8 @@ std::map> presetFuncMap = {"a2a", {AllToAllPreset, "Tests parallel transfers between all pairs of GPU devices"}}, {"a2a_n", {AllToAllRdmaPreset, "Tests parallel transfers between all pairs of GPU devices using Nearest NIC RDMA transfers"}}, {"a2asweep", {AllToAllSweepPreset, "Test GFX-based all-to-all transfers swept across different CU and GFX unroll counts"}}, - {"gfxsweep", {GfxSweepPreset, "Sweep BLOCKSIZES, UNROLLS, and NUM_SUB_EXECS for one GFX transfer (GFX_SWEEP_TRANSFER)"}}, + {"bmasweep", {BmaSweepPreset, "Test and compare batched DMA executor for multi destination copies"}}, + {"gfxsweep", {GfxSweepPreset, "Sweep over various GFX kernel options for a given GFX Transfer"}}, {"hbm", {HbmBandwidthPreset, "Tests HBM bandwidth"}}, {"healthcheck", {HealthCheckPreset, "Simple bandwidth health check (MI300X series only)"}}, {"nicrings", {NicRingsPreset, "Tests NIC rings created across identical NIC indices across ranks"}}, diff --git a/src/client/Utilities.hpp b/src/client/Utilities.hpp index 9b7cfb8..381d97c 100644 --- a/src/client/Utilities.hpp +++ b/src/client/Utilities.hpp @@ -393,12 +393,13 @@ namespace TransferBench::Utils std::string ExeTypeToStr(ExeType exeType) { switch (exeType) { - case EXE_CPU: return "CPU"; - case EXE_GPU_GFX: return "GPU"; - case EXE_GPU_DMA: return "DMA"; - case EXE_NIC: return "NIC"; - case EXE_NIC_NEAREST: return "NIC"; - 
default: return "N/A"; + case EXE_CPU: return "CPU"; + case EXE_GPU_GFX: return "GPU"; + case EXE_GPU_DMA: return "DMA"; + case EXE_NIC: return "NIC"; + case EXE_NIC_NEAREST: return "NIC"; + case EXE_GPU_BDMA: return "BMA"; + default: return "N/A"; } } diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp index 5f535e3..11efbd9 100644 --- a/src/header/TransferBench.hpp +++ b/src/header/TransferBench.hpp @@ -78,6 +78,11 @@ THE SOFTWARE. #endif /// @endcond +// Batched DMA executor is only supported with HIP >= 7.0 +#if defined(__HIP_PLATFORM_AMD__) && defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >= 7) +#define BMA_EXEC_ENABLED +#endif + namespace TransferBench { using std::map; @@ -98,11 +103,12 @@ namespace TransferBench EXE_GPU_GFX = 1, ///< GPU kernel-based executor (subExecutor = threadblock/CU) EXE_GPU_DMA = 2, ///< GPU SDMA executor (subExecutor = not supported) EXE_NIC = 3, ///< NIC RDMA executor (subExecutor = queue pair) - EXE_NIC_NEAREST = 4 ///< NIC RDMA nearest executor (subExecutor = queue pair) + EXE_NIC_NEAREST = 4, ///< NIC RDMA nearest executor (subExecutor = queue pair) + EXE_GPU_BDMA = 5, ///< GPU Batched SDMA executor (subExecutor = batch size) }; - char const ExeTypeStr[6] = "CGDIN"; + char const ExeTypeStr[7] = "CGDINB"; inline bool IsCpuExeType(ExeType e){ return e == EXE_CPU; } - inline bool IsGpuExeType(ExeType e){ return e == EXE_GPU_GFX || e == EXE_GPU_DMA; } + inline bool IsGpuExeType(ExeType e){ return e == EXE_GPU_GFX || e == EXE_GPU_DMA || e == EXE_GPU_BDMA; } inline bool IsNicExeType(ExeType e){ return e == EXE_NIC || e == EXE_NIC_NEAREST; } /** @@ -621,6 +627,7 @@ namespace TransferBench #define hipErrorPeerAccessAlreadyEnabled cudaErrorPeerAccessAlreadyEnabled #define hipFuncCachePreferShared cudaFuncCachePreferShared #define hipMemcpyDefault cudaMemcpyDefault + #define hipMemcpyKind cudaMemcpyKind #define hipMemcpyDeviceToHost cudaMemcpyDeviceToHost #define hipMemcpyHostToDevice cudaMemcpyHostToDevice #define 
hipSuccess cudaSuccess @@ -1061,7 +1068,7 @@ namespace { // Topology related struct RankTopology { - char hostname[33]; + char hostname[33]; char ppodId[16]; int64_t vpodId; @@ -2162,10 +2169,15 @@ namespace { break; } + if (t.numBytes % 4) { + errors.push_back({ERR_FATAL, "Transfer %d: numBytes must be a multiple of 4", i}); + break; + } + // Each subexecutor is assigned a multiple of cfg.data.blockBytes, however this may // mean that some subexecutors might not have any work assigned to them if the amount to // transfer is small - if (t.exeDevice.exeType == EXE_GPU_GFX || t.exeDevice.exeType == EXE_CPU) { + if (t.exeDevice.exeType == EXE_GPU_GFX || t.exeDevice.exeType == EXE_CPU || t.exeDevice.exeType == EXE_GPU_BDMA) { size_t const N = t.numBytes / sizeof(float); int const targetMultiple = cfg.data.blockBytes / sizeof(float); int const maxSubExecToUse = std::min((size_t)(N + targetMultiple - 1) / targetMultiple, @@ -2250,9 +2262,15 @@ namespace { } break; case EXE_GPU_DMA: - if (t.srcs.size() != 1 || t.dsts.size() != 1) { + if (t.srcs.size() != 1) { + errors.push_back({ERR_FATAL, + "Transfer %d: DMA executor must have exactly 1 source", i}); + hasFatalError = true; + break; + } + if (t.dsts.size() < 1) { errors.push_back({ERR_FATAL, - "Transfer %d: DMA executor must have exactly 1 source and 1 destination", i}); + "Transfer %d: DMA executor must have at least 1 destination", i}); hasFatalError = true; break; } @@ -2294,19 +2312,10 @@ namespace { } } - err = System::Get().GetHsaAgent(t.dsts[0], dstAgent); - if (err.errType != ERR_NONE) { - errors.push_back(err); - if (err.errType == ERR_FATAL) { - hasFatalError = true; - break; - } - } - // Skip check of engine Id mask for self copies - if (srcAgent.handle != dstAgent.handle) { - uint32_t engineIdMask = 0; - err = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &engineIdMask); + int numDsts = (int)t.dsts.size(); + for (int dstIdx = 0; dstIdx < numDsts; dstIdx++) { + err = 
System::Get().GetHsaAgent(t.dsts[dstIdx], dstAgent); if (err.errType != ERR_NONE) { errors.push_back(err); if (err.errType == ERR_FATAL) { @@ -2314,15 +2323,29 @@ namespace { break; } } - hsa_amd_sdma_engine_id_t sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << t.exeSubIndex); - if (!(sdmaEngineId & engineIdMask)) { - errors.push_back({ERR_FATAL, - "Transfer %d: DMA %d.%d does not exist or cannot copy between src/dst", - i, t.exeDevice.exeIndex, t.exeSubIndex}); - hasFatalError = true; - break; + + // Skip check of engine Id mask for self copies + if (srcAgent.handle != dstAgent.handle) { + uint32_t engineIdMask = 0; + err = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &engineIdMask); + if (err.errType != ERR_NONE) { + errors.push_back(err); + if (err.errType == ERR_FATAL) { + hasFatalError = true; + break; + } + } + hsa_amd_sdma_engine_id_t sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << t.exeSubIndex); + if (!(sdmaEngineId & engineIdMask)) { + errors.push_back({ERR_FATAL, + "Transfer %d: DMA %d.%d does not exist or cannot copy between src/dst", + i, t.exeDevice.exeIndex, t.exeSubIndex}); + hasFatalError = true; + break; + } } } + if (hasFatalError) break; #endif } @@ -2345,6 +2368,60 @@ namespace { } } break; + case EXE_GPU_BDMA: +#ifdef BMA_EXEC_ENABLED + if (t.srcs.size() != 1) { + errors.push_back({ERR_FATAL, + "Transfer %d: BMA executor must have exactly 1 source", i}); + hasFatalError = true; + break; + } + if (t.dsts.size() < 1) { + errors.push_back({ERR_FATAL, + "Transfer %d: BMA executor must have at least 1 destination", i}); + hasFatalError = true; + break; + } + + if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numExecutors) { + errors.push_back({ERR_FATAL, + "Transfer %d: BMA index must be between 0 and %d (instead of %d) for rank %d", + i, numExecutors - 1, t.exeDevice.exeIndex, t.exeDevice.exeRank}); + hasFatalError = true; + break; + } + + if (t.exeSubIndex != -1) { + errors.push_back({ERR_FATAL, + "Transfer %d: BMA executor does 
not support executor subindices (SDMA engine selection)", i}); + hasFatalError = true; + break; + } + + if (!IsGpuMemType(t.srcs[0].memType) && !IsGpuMemType(t.dsts[0].memType)) { + errors.push_back({ERR_WARN, + "Transfer %d: No GPU memory for source or destination. Copy might not execute on BMA %d", + i, t.exeDevice.exeIndex}); + } else { + if (IsGpuMemType(t.srcs[0].memType)) { + if (t.srcs[0].memIndex != t.exeDevice.exeIndex) { + errors.push_back({ERR_WARN, + "Transfer %d: BMA executor may use the source memory device (%d) not (%d)", + i, t.srcs[0].memIndex, t.exeDevice.exeIndex}); + } + } else if (t.dsts[0].memIndex != t.exeDevice.exeIndex) { + errors.push_back({ERR_WARN, + "Transfer %d: BMA executor may use the destination memory device (%d) not (%d)", + i, t.dsts[0].memIndex, t.exeDevice.exeIndex}); + } + } + break; +#else + errors.push_back({ERR_FATAL, + "Transfer %d: BMA executor requires ROCm 7.0 or newer (AMD HIP with hipMemcpyBatchAsync)", i}); + hasFatalError = true; + break; +#endif case EXE_NIC: case EXE_NIC_NEAREST: #ifdef NIC_EXEC_ENABLED { @@ -2521,6 +2598,15 @@ namespace { "DMA %d copies will fallback to blit (GFX) kernels", exeDevice.exeIndex}); break; } + case EXE_GPU_BDMA: + { + if (transferCount[exeDevice] > gpuMaxHwQueues) { + errors.push_back({ERR_WARN, + "BMA %d attempting %d parallel transfers, however GPU_MAX_HW_QUEUES only set to %d", + exeDevice.exeIndex, transferCount[exeDevice], gpuMaxHwQueues}); + } + break; + } default: break; } @@ -2578,13 +2664,13 @@ namespace { // For targeted-SDMA #if !defined(__NVCC__) - hsa_agent_t dstAgent; ///< DMA destination memory agent + vector dstAgent; ///< DMA destination memory agents hsa_agent_t srcAgent; ///< DMA source memory agent hsa_signal_t signal; ///< HSA signal for completion hsa_amd_sdma_engine_id_t sdmaEngineId; ///< DMA engine ID #endif -// For IBV executor + // For IBV executor #ifdef NIC_EXEC_ENABLED int srcNicIndex; ///< SRC NIC index int dstNicIndex; ///< DST NIC index @@ -2612,6 
+2698,13 @@ namespace { vector>sendWorkRequests; ///< Send work requests per queue pair #endif + // For BMA executor +#ifdef BMA_EXEC_ENABLED + vector batchDsts; + vector batchSrcs; + vector batchBytes; +#endif + // Counters double totalDurationMsec; ///< Total duration for all iterations for this Transfer vector perIterMsec; ///< Duration for each individual iteration @@ -3791,6 +3884,21 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } } +#ifdef BMA_EXEC_ENABLED + // Prepare src/dst pointers for batched DMA executor + rss.batchDsts.clear(); + rss.batchSrcs.clear(); + rss.batchBytes.clear(); + + for (int i = 0; i < transfer.numSubExecs; ++i) { + for (int j = 0; j < (int)rss.dstMem.size(); j++) { + rss.batchSrcs.push_back(subExecParam[i].src[0]); + rss.batchDsts.push_back(subExecParam[i].dst[j]); + rss.batchBytes.push_back(subExecParam[i].N * sizeof(float)); + } + } +#endif + // Clear counters rss.totalDurationMsec = 0.0; @@ -3926,8 +4034,12 @@ static bool IsConfiguredGid(union ibv_gid const& gid) // Collect HSA agent information hsa_amd_pointer_info_t info; info.size = sizeof(info); - ERR_CHECK(hsa_amd_pointer_info(rss.dstMem[0], &info, NULL, NULL, NULL)); - rss.dstAgent = info.agentOwner; + int numDst = (int)rss.dstMem.size(); + rss.dstAgent.resize(numDst); + for (int dstIdx = 0; dstIdx < numDst; dstIdx++) { + ERR_CHECK(hsa_amd_pointer_info(rss.dstMem[dstIdx], &info, NULL, NULL, NULL)); + rss.dstAgent[dstIdx] = info.agentOwner; + } ERR_CHECK(hsa_amd_pointer_info(rss.srcMem[0], &info, NULL, NULL, NULL)); rss.srcAgent = info.agentOwner; @@ -3945,11 +4057,12 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } // Prepare additional requirements for GPU-based executors - if ((exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA) && exeDevice.exeRank == localRank) { + if ((exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA || exeDevice.exeType == EXE_GPU_BDMA) + && exeDevice.exeRank == localRank) { 
ERR_CHECK(hipSetDevice(exeDevice.exeIndex)); // Determine how many streams to use - int const numStreamsToUse = (exeDevice.exeType == EXE_GPU_DMA || + int const numStreamsToUse = (exeDevice.exeType == EXE_GPU_DMA || exeDevice.exeType == EXE_GPU_BDMA || (exeDevice.exeType == EXE_GPU_GFX && cfg.gfx.useMultiStream)) ? exeInfo.resources.size() : 1; exeInfo.streams.resize(numStreamsToUse); @@ -4147,7 +4260,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } // Teardown additional requirements for GPU-based executors - if ((exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA) && exeDevice.exeRank == localRank) { + if ((exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA || exeDevice.exeType == EXE_GPU_BDMA) + && exeDevice.exeRank == localRank) { for (auto stream : exeInfo.streams) ERR_CHECK(hipStreamDestroy(stream)); if (cfg.gfx.useHipEvents || cfg.dma.useHipEvents) { @@ -4893,22 +5007,33 @@ static bool IsConfiguredGid(union ibv_gid const& gid) { auto cpuStart = std::chrono::high_resolution_clock::now(); + int numDsts = (int)resources.dstMem.size(); ERR_CHECK(hipSetDevice(exeIndex)); int subIterations = 0; if (!useSubIndices && !cfg.dma.useHsaCopy) { if (cfg.dma.useHipEvents) ERR_CHECK(hipEventRecord(startEvent, stream)); + // Force the use of SDMA engine if possible +#if defined(__HIP_PLATFORM_AMD__) && defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >= 6) + hipMemcpyKind memcpyKind = hipMemcpyDeviceToDeviceNoCU; +#else + hipMemcpyKind memcpyKind = hipMemcpyDefault; +#endif + // Use DMA copy engine do { + // Queue for each output location + for (int dstIdx = 0; dstIdx < numDsts; dstIdx++) { #if defined(__NVCC__) - ERR_CHECK(cuMemcpyAsync((CUdeviceptr)resources.dstMem[0], - (CUdeviceptr)resources.srcMem[0], - resources.numBytes, stream)); + ERR_CHECK(cuMemcpyAsync((CUdeviceptr)resources.dstMem[dstIdx], + (CUdeviceptr)resources.srcMem[0], + resources.numBytes, stream)); #else - ERR_CHECK(hipMemcpyAsync(resources.dstMem[0], 
resources.srcMem[0], resources.numBytes, - hipMemcpyDefault, stream)); + ERR_CHECK(hipMemcpyAsync(resources.dstMem[dstIdx], resources.srcMem[0], resources.numBytes, + memcpyKind, stream)); #endif + } } while (++subIterations != cfg.general.numSubIterations); if (cfg.dma.useHipEvents) @@ -4920,20 +5045,22 @@ static bool IsConfiguredGid(union ibv_gid const& gid) #else // Use HSA async copy do { - hsa_signal_store_screlease(resources.signal, 1); - if (!useSubIndices) { - ERR_CHECK(hsa_amd_memory_async_copy(resources.dstMem[0], resources.dstAgent, - resources.srcMem[0], resources.srcAgent, - resources.numBytes, 0, NULL, - resources.signal)); - } else { - HSA_CALL(hsa_amd_memory_async_copy_on_engine(resources.dstMem[0], resources.dstAgent, - resources.srcMem[0], resources.srcAgent, - resources.numBytes, 0, NULL, - resources.signal, - resources.sdmaEngineId, true)); + hsa_signal_store_screlease(resources.signal, numDsts); + for (int dstIdx = 0; dstIdx < numDsts; dstIdx++) { + if (!useSubIndices) { + ERR_CHECK(hsa_amd_memory_async_copy(resources.dstMem[dstIdx], resources.dstAgent[dstIdx], + resources.srcMem[0], resources.srcAgent, + resources.numBytes, 0, NULL, + resources.signal)); + } else { + HSA_CALL(hsa_amd_memory_async_copy_on_engine(resources.dstMem[dstIdx], resources.dstAgent[dstIdx], + resources.srcMem[0], resources.srcAgent, + resources.numBytes, 0, NULL, + resources.signal, + resources.sdmaEngineId, true)); + } } - // Wait for SDMA transfer to complete + // Wait for SDMA transfer(s) to complete while(hsa_signal_wait_scacquire(resources.signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE) >= 1); @@ -4990,6 +5117,88 @@ static bool IsConfiguredGid(union ibv_gid const& gid) return ERR_NONE; } +// BMA Executor-related functions +//======================================================================================== +#ifdef BMA_EXEC_ENABLED + // Execute a single BMA Transfer (one hipMemcpyBatchAsync per sub-iteration; each subexecutor is one 
batch entry) + static ErrResult ExecuteBatchDmaTransfer(int const iteration, + int const exeIndex, + hipStream_t const stream, + hipEvent_t const startEvent, + hipEvent_t const stopEvent, + ConfigOptions const& cfg, + TransferResources& resources) + { + auto cpuStart = std::chrono::high_resolution_clock::now(); + + ERR_CHECK(hipSetDevice(exeIndex)); + + int subIterations = 0; + if (cfg.dma.useHipEvents) + ERR_CHECK(hipEventRecord(startEvent, stream)); + + size_t failIdx = 0; + do { + ERR_CHECK(hipMemcpyBatchAsync(resources.batchDsts.data(), + resources.batchSrcs.data(), + resources.batchBytes.data(), + resources.batchDsts.size(), + nullptr, nullptr, 0, &failIdx, stream)); + } while (++subIterations != cfg.general.numSubIterations); + + if (cfg.dma.useHipEvents) + ERR_CHECK(hipEventRecord(stopEvent, stream)); + ERR_CHECK(hipStreamSynchronize(stream)); + + auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; + double cpuDeltaMsec = std::chrono::duration_cast>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations; + + if (iteration >= 0) { + double deltaMsec = cpuDeltaMsec; + if (cfg.dma.useHipEvents) { + float gpuDeltaMsec; + ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent)); + deltaMsec = gpuDeltaMsec / cfg.general.numSubIterations; + } + resources.totalDurationMsec += deltaMsec; + if (cfg.general.recordPerIteration) + resources.perIterMsec.push_back(deltaMsec); + } + return ERR_NONE; + } + + static ErrResult RunBmaExecutor(int const iteration, + ConfigOptions const& cfg, + int const exeIndex, + ExeInfo& exeInfo) + { + auto cpuStart = std::chrono::high_resolution_clock::now(); + ERR_CHECK(hipSetDevice(exeIndex)); + + vector> asyncTransfers; + for (int i = 0; i < exeInfo.resources.size(); i++) { + asyncTransfers.emplace_back(std::async(std::launch::async, + ExecuteBatchDmaTransfer, + iteration, + exeIndex, + exeInfo.streams[i], + cfg.dma.useHipEvents ? exeInfo.startEvents[i] : NULL, + cfg.dma.useHipEvents ? 
exeInfo.stopEvents[i] : NULL, + std::cref(cfg), + std::ref(exeInfo.resources[i]))); + } + + for (auto& asyncTransfer : asyncTransfers) + ERR_CHECK(asyncTransfer.get()); + + auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; + double deltaMsec = std::chrono::duration_cast>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations; + if (iteration >= 0) + exeInfo.totalDurationMsec += deltaMsec; + return ERR_NONE; + } +#endif // BMA_EXEC_ENABLED + // Executor-related functions //======================================================================================== static ErrResult RunExecutor(int const iteration, @@ -4998,13 +5207,16 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ExeInfo& exeInfo) { switch (exeDevice.exeType) { - case EXE_CPU: return RunCpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); - case EXE_GPU_GFX: return RunGpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); - case EXE_GPU_DMA: return RunDmaExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); + case EXE_CPU: return RunCpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); + case EXE_GPU_GFX: return RunGpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); + case EXE_GPU_DMA: return RunDmaExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); #ifdef NIC_EXEC_ENABLED - case EXE_NIC: return RunNicExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); + case EXE_NIC: return RunNicExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); +#endif +#ifdef BMA_EXEC_ENABLED + case EXE_GPU_BDMA: return RunBmaExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); #endif - default: return {ERR_FATAL, "Unsupported executor (%d)", exeDevice.exeType}; + default: return {ERR_FATAL, "Unsupported executor (%d)", exeDevice.exeType}; } } @@ -5609,7 +5821,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) result |= RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers); wc.exe.exeSubIndices[0] = -2; return result; - case 
EXE_GPU_GFX: case EXE_GPU_DMA: + case EXE_GPU_GFX: case EXE_GPU_DMA: case EXE_GPU_BDMA: { // Iterate over all available subindices ExeDevice exeDevice = {wc.exe.exeType, wc.exe.exeIndices[0], wc.exe.exeRanks[0], 0}; @@ -6338,6 +6550,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) if (status != hipSuccess) numGpus = 0; topo.numExecutors[EXE_GPU_GFX] = numGpus; topo.numExecutors[EXE_GPU_DMA] = numGpus; + topo.numExecutors[EXE_GPU_BDMA] = numGpus; for (int exeIndex = 0; exeIndex < numGpus; exeIndex++) { int numDeviceCUs = 0; @@ -6356,6 +6569,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } topo.executorName[{EXE_GPU_GFX, exeIndex}] = gpuName; topo.executorName[{EXE_GPU_DMA, exeIndex}] = gpuName; + topo.executorName[{EXE_GPU_BDMA, exeIndex}] = gpuName; #if !defined(__NVCC__) hsa_agent_t gpuAgent = gpuAgents[exeIndex]; @@ -6384,8 +6598,10 @@ static bool IsConfiguredGid(union ibv_gid const& gid) #endif topo.numExecutorSubIndices[{EXE_GPU_GFX, exeIndex}] = numXccs; topo.numExecutorSubIndices[{EXE_GPU_DMA, exeIndex}] = numDmaEngines; + topo.numExecutorSubIndices[{EXE_GPU_BDMA, exeIndex}] = 0; topo.numSubExecutors[{EXE_GPU_GFX, exeIndex}] = numDeviceCUs; topo.numSubExecutors[{EXE_GPU_DMA, exeIndex}] = 1; + topo.numSubExecutors[{EXE_GPU_BDMA, exeIndex}] = numDmaEngines; topo.closestCpuNumaToGpu[exeIndex] = closestNuma; topo.closestNicsToGpu[exeIndex] = {}; } @@ -6805,7 +7021,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) return {ERR_FATAL, "CPU index must be between 0 and %d inclusively", numCpus - 1}; agent = cpuAgents[exeDevice.exeIndex]; break; - case EXE_GPU_GFX: case EXE_GPU_DMA: + case EXE_GPU_GFX: case EXE_GPU_DMA: case EXE_GPU_BDMA: if (exeIndex < 0 || exeIndex >= numGpus) return {ERR_FATAL, "GPU index must be between 0 and %d inclusively", numGpus - 1}; agent = gpuAgents[exeIndex]; break; @@ -7164,6 +7380,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) #undef hipErrorPeerAccessAlreadyEnabled #undef hipFuncCachePreferShared 
#undef hipMemcpyDefault +#undef hipMemcpyKind #undef hipMemcpyDeviceToHost #undef hipMemcpyHostToDevice #undef hipSuccess From 2372321eb6d71dd963a3e18d623d078565aa7f7d Mon Sep 17 00:00:00 2001 From: Gilbert Lee Date: Fri, 10 Apr 2026 20:29:01 -0500 Subject: [PATCH 2/4] Minor fixes to Batched DMA support --- CHANGELOG.md | 2 +- examples/example.cfg | 4 +-- src/client/Presets/BmaSweep.hpp | 51 +++++++++++++++++++++++++-------- src/header/TransferBench.hpp | 26 ++++++++++------- 4 files changed, 57 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cb79743..2a4d775 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,7 +26,7 @@ Documentation for TransferBench is available at - NIC_FILTER renamed to TB_NIC_FILTER for consistency - DUMP_LINES renamed to TB_DUMP_LINES for consistency - Dynamically size CQs for NIC transfers in high QPs case -- Switch to using hipMemcpyDevicetoDeviceNoCU instead of hipMemcpyDefault for DMA Executor if available (requires HIP >= 6.0) +- Switch to using hipMemcpyDeviceToDeviceNoCU instead of hipMemcpyDefault for DMA Executor if available (requires HIP >= 6.0) - Allow for multiple destination memory locations for DMA/Batched-DMA Transfers ## v1.66.02 diff --git a/examples/example.cfg b/examples/example.cfg index 9c5e29f..14df7e3 100644 --- a/examples/example.cfg +++ b/examples/example.cfg @@ -12,9 +12,9 @@ # Executor: SubExecutor: # 1) CPU CPU thread # 2) GPU GPU threadblock/Compute Unit (CU) -# 3) DMA N/A. (May only be used for copies (single SRC/DST) +# 3) DMA N/A. 
(Must have single SRC, at least one DST) # 4) NIC Queue Pair -# 5) Batched-DMA Batch size +# 5) Batched-DMA Batch item (Must have single SRC, at least one DST) # Each single line in the configuration file defines a set of Transfers (a Test) to run in parallel diff --git a/src/client/Presets/BmaSweep.hpp b/src/client/Presets/BmaSweep.hpp index cb28f82..6ef1df8 100644 --- a/src/client/Presets/BmaSweep.hpp +++ b/src/client/Presets/BmaSweep.hpp @@ -31,7 +31,7 @@ int BmaSweepPreset(EnvVars& ev, } #ifndef BMA_EXEC_ENABLED - Utils::Print("[ERROR] BMA executor requires ROCm 7.0 or newer\n"); + Utils::Print("[ERROR] BMA executor requires ROCm 7.1 or newer\n"); return 1; #endif @@ -40,9 +40,11 @@ int BmaSweepPreset(EnvVars& ev, // Collect env vars for this preset int exeIndex = EnvVars::GetEnvVar("EXE_INDEX" , 0); int localCopy = EnvVars::GetEnvVar("LOCAL_COPY" , 0); + vector gfxSesList = EnvVars::GetEnvVarArray("GFX_SUB_EXECS", {}); int gpuMemTypeIdx = EnvVars::GetEnvVar("GPU_MEM_TYPE" , 0); int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES" , numDetectedGpus); - vector numSesList = EnvVars::GetEnvVarArray("NUM_SUB_EXECS", {1,2,4,8}); + vector bmaSesList = EnvVars::GetEnvVarArray("NUM_SUB_EXECS", {1,2,4,8}); + MemType gpuMemType = Utils::GetGpuMemType(gpuMemTypeIdx); @@ -54,9 +56,10 @@ int BmaSweepPreset(EnvVars& ev, if (!outputToCsv) printf("[BMA Sweep Related]\n"); ev.Print("EXE_INDEX" , exeIndex, "Executing on GPU %d", exeIndex); ev.Print("LOCAL_COPY" , localCopy, "%s local copy to GPU %d", localCopy ? 
"Including" : "Excluding", exeIndex); + ev.Print("GFX_SUB_EXECS" , gfxSesList.size(), EnvVars::ToStr(gfxSesList).c_str()); ev.Print("GPU_MEM_TYPE" , gpuMemTypeIdx, "Using %s (%s)", Utils::GetGpuMemTypeStr(gpuMemTypeIdx).c_str(), Utils::GetAllGpuMemTypeStr().c_str()); ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices); - ev.Print("NUM_SUB_EXECS" , numSesList.size(), EnvVars::ToStr(numSesList).c_str()); + ev.Print("NUM_SUB_EXECS" , bmaSesList.size(), EnvVars::ToStr(bmaSesList).c_str()); printf("\n"); } } @@ -66,7 +69,9 @@ int BmaSweepPreset(EnvVars& ev, return 1; } - int numTransfers = numGpuDevices - 1 + (localCopy ? 1 : 0); + int numTransfers = numGpuDevices - 1 + (localCopy ? 1 : 0); + int numBmaSubExec = (int)bmaSesList.size(); + int numGfxSubExec = (int)gfxSesList.size(); TransferBench::ConfigOptions cfg = ev.ToConfigOptions(); TransferBench::TestResults results; @@ -74,25 +79,30 @@ int BmaSweepPreset(EnvVars& ev, // Prepare table of results int minPow2Exp = 12; int maxPow2Exp = 30; - int numRows = (maxPow2Exp - minPow2Exp + 1) + 1; - int numCols = 2 + numSesList.size(); + int numRows = 1 + (bytesSpecified ? 
1 : (maxPow2Exp - minPow2Exp + 1)); + int numCols = 2 + numBmaSubExec + numGfxSubExec; Utils::TableHelper table(numRows, numCols); - - Utils::Print("Performing %d simultaneous DMA Transfers from GPU %0 to other GPUs\n", numTransfers, exeIndex); + Utils::Print("Performing %d simultaneous DMA Transfers from GPU %d to other GPUs\n", numTransfers, exeIndex); // Prepare headers + table.Set(0, 0, " Bytes "); table.Set(0, 1, " DMA "); - for (int i = 0; i < numSesList.size(); i++) { - table.Set(0, 2+i, " BMA (%d) ", numSesList[i]); + for (int i = 0; i < numBmaSubExec; i++) { + table.Set(0, 2+i, " BMA(%02d) ", bmaSesList[i]); + } + for (int i = 0; i < numGfxSubExec; i++) { + table.Set(0, 2+numBmaSubExec+i, " GFX(%02d) ", gfxSesList[i]); } + table.DrawRowBorder(0); table.DrawRowBorder(1); table.DrawRowBorder(numRows); table.DrawColBorder(0); table.DrawColBorder(1); table.DrawColBorder(2); + table.DrawColBorder(2+numBmaSubExec); table.DrawColBorder(numCols); if (!ev.outputToCsv){ @@ -101,6 +111,8 @@ int BmaSweepPreset(EnvVars& ev, }; for (size_t numBytes = 1ULL<= 7.0 -#if defined(__HIP_PLATFORM_AMD__) && defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >= 7) +// Batched DMA executor is only supported with HIP >= 7.1 +#if defined(__HIP_PLATFORM_AMD__) && \ + defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >= 7) && \ + defined(HIP_VERSION_MINOR) && (HIP_VERSION_MINOR >= 1) #define BMA_EXEC_ENABLED #endif @@ -104,7 +106,7 @@ namespace TransferBench EXE_GPU_DMA = 2, ///< GPU SDMA executor (subExecutor = not supported) EXE_NIC = 3, ///< NIC RDMA executor (subExecutor = queue pair) EXE_NIC_NEAREST = 4, ///< NIC RDMA nearest executor (subExecutor = queue pair) - EXE_GPU_BDMA = 5, ///< GPU Batched SDMA execttor (subExecutor = batch size) + EXE_GPU_BDMA = 5, ///< GPU Batched SDMA executor (subExecutor = batch item) }; char const ExeTypeStr[7] = "CGDINB"; inline bool IsCpuExeType(ExeType e){ return e == EXE_CPU; } @@ -2170,7 +2172,7 @@ namespace { } if (t.numBytes % 4) { - 
errors.push_back({ERR_FATAL, "Transfer %d: numBytes must be a multiple of 4\n", t.numBytes}); + errors.push_back({ERR_FATAL, "Transfer %d: numBytes (%lu) must be a multiple of 4\n", i, t.numBytes}); break; } @@ -2418,7 +2420,7 @@ namespace { break; #else errors.push_back({ERR_FATAL, - "Transfer %d: BMA executor requires ROCm 7.0 or newer (AMD HIP with hipMemcpyBatchAsync)", i}); + "Transfer %d: BMA executor requires ROCm 7.1 or newer (AMD HIP with hipMemcpyBatchAsync)", i}); hasFatalError = true; break; #endif @@ -3890,11 +3892,13 @@ static bool IsConfiguredGid(union ibv_gid const& gid) rss.batchSrcs.clear(); rss.batchBytes.clear(); - for (int i = 0; i < transfer.numSubExecs; ++i) { - for (int j = 0; j < (int)rss.dstMem.size(); j++) { - rss.batchSrcs.push_back(subExecParam[i].src[0]); - rss.batchDsts.push_back(subExecParam[i].dst[j]); - rss.batchBytes.push_back(subExecParam[i].N * sizeof(float)); + if (transfer.exeDevice.exeType == EXE_GPU_BMDA) { + for (int i = 0; i < transfer.numSubExecs; ++i) { + for (int j = 0; j < (int)rss.dstMem.size(); j++) { + rss.batchSrcs.push_back(subExecParam[i].src[0]); + rss.batchDsts.push_back(subExecParam[i].dst[j]); + rss.batchBytes.push_back(subExecParam[i].N * sizeof(float)); + } } } #endif @@ -6601,7 +6605,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) topo.numExecutorSubIndices[{EXE_GPU_BDMA, exeIndex}] = 0; topo.numSubExecutors[{EXE_GPU_GFX, exeIndex}] = numDeviceCUs; topo.numSubExecutors[{EXE_GPU_DMA, exeIndex}] = 1; - topo.numSubExecutors[{EXE_GPU_DMA, exeIndex}] = numDmaEngines; + topo.numSubExecutors[{EXE_GPU_BDMA, exeIndex}] = numDmaEngines; topo.closestCpuNumaToGpu[exeIndex] = closestNuma; topo.closestNicsToGpu[exeIndex] = {}; } From 8bab3a2e7deefb232062e287c8c28b71a10034bf Mon Sep 17 00:00:00 2001 From: Gilbert Lee Date: Sat, 11 Apr 2026 15:59:06 -0500 Subject: [PATCH 3/4] Fixing up typos / bugs --- CHANGELOG.md | 2 +- src/header/TransferBench.hpp | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) 
diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a4d775..d0dd559 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ Documentation for TransferBench is available at - Adding NIC_CQ_POLL_BATCH to control CQ poll batch size for NIC transfers - New "hbm" preset which sweeps and tests local HBM read performance - Added a new TB_WALLCLOCK_RATE that will override GPU GFX wallclock rate if it returns 0 (debug) -- Adding new batched-DMA executor "B", which utilizes the hipMemcpyBatchAsync API introduced in HIP 7.0 +- Adding new batched-DMA executor "B", which utilizes the hipMemcpyBatchAsync API introduced in HIP 7.1 - Added new bmasweep preset that compares DMA to batched DMA execution for parallel transfers to other GPUs ### Modified diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp index 815a0b8..e7e1720 100644 --- a/src/header/TransferBench.hpp +++ b/src/header/TransferBench.hpp @@ -79,9 +79,8 @@ THE SOFTWARE. /// @endcond // Batched DMA executor is only supported with HIP >= 7.1 -#if defined(__HIP_PLATFORM_AMD__) && \ - defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >= 7) && \ - defined(HIP_VERSION_MINOR) && (HIP_VERSION_MINOR >= 1) +#if defined(__HIP_PLATFORM_AMD__) && defined(HIP_VERSION_MAJOR) && defined(HIP_VERSION_MINOR) && \ + ((HIP_VERSION_MAJOR > 7) || (HIP_VERSION_MAJOR == 7 && HIP_VERSION_MINOR >= 1)) #define BMA_EXEC_ENABLED #endif @@ -3892,7 +3891,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) rss.batchSrcs.clear(); rss.batchBytes.clear(); - if (transfer.exeDevice.exeType == EXE_GPU_BMDA) { + if (transfer.exeDevice.exeType == EXE_GPU_BDMA) { for (int i = 0; i < transfer.numSubExecs; ++i) { for (int j = 0; j < (int)rss.dstMem.size(); j++) { rss.batchSrcs.push_back(subExecParam[i].src[0]); From e55d2a58f93efb05c5192aecb1fc450a19604295 Mon Sep 17 00:00:00 2001 From: Gilbert Lee Date: Mon, 13 Apr 2026 23:49:19 -0500 Subject: [PATCH 4/4] Adding support for cudaMemcpyBatchAsync --- src/header/TransferBench.hpp | 19 
+++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp index e7e1720..aff26a5 100644 --- a/src/header/TransferBench.hpp +++ b/src/header/TransferBench.hpp @@ -78,9 +78,8 @@ THE SOFTWARE. #endif /// @endcond -// Batched DMA executor is only supported with HIP >= 7.1 -#if defined(__HIP_PLATFORM_AMD__) && defined(HIP_VERSION_MAJOR) && defined(HIP_VERSION_MINOR) && \ - ((HIP_VERSION_MAJOR > 7) || (HIP_VERSION_MAJOR == 7 && HIP_VERSION_MINOR >= 1)) +// Batched DMA executor is only supported with HIP >= 7.1 and CUDA 12.8 +#if (defined(HIP_VERSION) && (HIP_VERSION >= 710)) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12080)) #define BMA_EXEC_ENABLED #endif @@ -659,6 +658,7 @@ namespace TransferBench #define hipMallocManaged cudaMallocManaged #define hipMemcpy cudaMemcpy #define hipMemcpyAsync cudaMemcpyAsync + #define hipMemcpyBatchAsync cudaMemcpyBatchAsync #define hipMemset cudaMemset #define hipMemsetAsync cudaMemsetAsync #define hipSetDevice cudaSetDevice @@ -2701,9 +2701,9 @@ namespace { // For BMA executor #ifdef BMA_EXEC_ENABLED - vector batchDsts; - vector batchSrcs; - vector batchBytes; + vector batchDsts; ///< Destination pointers (per batch item) + vector batchSrcs; ///< Source pointers (per batch item) + vector batchBytes; ///< Bytes to copy (per batch item) #endif // Counters @@ -3890,7 +3890,6 @@ static bool IsConfiguredGid(union ibv_gid const& gid) rss.batchDsts.clear(); rss.batchSrcs.clear(); rss.batchBytes.clear(); - if (transfer.exeDevice.exeType == EXE_GPU_BDMA) { for (int i = 0; i < transfer.numSubExecs; ++i) { for (int j = 0; j < (int)rss.dstMem.size(); j++) { @@ -5146,7 +5145,11 @@ static bool IsConfiguredGid(union ibv_gid const& gid) resources.batchSrcs.data(), resources.batchBytes.data(), resources.batchDsts.size(), - nullptr, nullptr, 0, &failIdx, stream)); + nullptr, nullptr, 0, +#if !defined(__NVCC__) + &failIdx, +#endif + stream)); } while (++subIterations 
!= cfg.general.numSubIterations); if (cfg.dma.useHipEvents)