Skip to content

Commit 2372321

Browse files
Minor fixes to Batched DMA support
1 parent e100737 commit 2372321

4 files changed

Lines changed: 57 additions & 26 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ Documentation for TransferBench is available at
2626
- NIC_FILTER renamed to TB_NIC_FILTER for consistency
2727
- DUMP_LINES renamed to TB_DUMP_LINES for consistency
2828
- Dynamically size CQs for NIC transfers in high QPs case
29-
- Switch to using hipMemcpyDevicetoDeviceNoCU instead of hipMemcpyDefault for DMA Executor if available (requires HIP >= 6.0)
29+
- Switch to using hipMemcpyDeviceToDeviceNoCU instead of hipMemcpyDefault for DMA Executor if available (requires HIP >= 6.0)
3030
- Allow for multiple destination memory locations for DMA/Batched-DMA Transfers
3131

3232
## v1.66.02

examples/example.cfg

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@
1212
# Executor: SubExecutor:
1313
# 1) CPU CPU thread
1414
# 2) GPU GPU threadblock/Compute Unit (CU)
15-
# 3) DMA N/A. (May only be used for copies (single SRC/DST)
15+
# 3) DMA N/A. (Must have single SRC, at least one DST)
1616
# 4) NIC Queue Pair
17-
# 5) Batched-DMA Batch size
17+
# 5) Batched-DMA Batch item (Must have single SRC, at least one DST)
1818

1919
# Each single line in the configuration file defines a set of Transfers (a Test) to run in parallel
2020

src/client/Presets/BmaSweep.hpp

Lines changed: 39 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ int BmaSweepPreset(EnvVars& ev,
3131
}
3232

3333
#ifndef BMA_EXEC_ENABLED
34-
Utils::Print("[ERROR] BMA executor requires ROCm 7.0 or newer\n");
34+
Utils::Print("[ERROR] BMA executor requires ROCm 7.1 or newer\n");
3535
return 1;
3636
#endif
3737

@@ -40,9 +40,11 @@ int BmaSweepPreset(EnvVars& ev,
4040
// Collect env vars for this preset
4141
int exeIndex = EnvVars::GetEnvVar("EXE_INDEX" , 0);
4242
int localCopy = EnvVars::GetEnvVar("LOCAL_COPY" , 0);
43+
vector<int> gfxSesList = EnvVars::GetEnvVarArray("GFX_SUB_EXECS", {});
4344
int gpuMemTypeIdx = EnvVars::GetEnvVar("GPU_MEM_TYPE" , 0);
4445
int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES" , numDetectedGpus);
45-
vector<int> numSesList = EnvVars::GetEnvVarArray("NUM_SUB_EXECS", {1,2,4,8});
46+
vector<int> bmaSesList = EnvVars::GetEnvVarArray("NUM_SUB_EXECS", {1,2,4,8});
47+
4648

4749
MemType gpuMemType = Utils::GetGpuMemType(gpuMemTypeIdx);
4850

@@ -54,9 +56,10 @@ int BmaSweepPreset(EnvVars& ev,
5456
if (!outputToCsv) printf("[BMA Sweep Related]\n");
5557
ev.Print("EXE_INDEX" , exeIndex, "Executing on GPU %d", exeIndex);
5658
ev.Print("LOCAL_COPY" , localCopy, "%s local copy to GPU %d", localCopy ? "Including" : "Excluding", exeIndex);
59+
ev.Print("GFX_SUB_EXECS" , gfxSesList.size(), EnvVars::ToStr(gfxSesList).c_str());
5760
ev.Print("GPU_MEM_TYPE" , gpuMemTypeIdx, "Using %s (%s)", Utils::GetGpuMemTypeStr(gpuMemTypeIdx).c_str(), Utils::GetAllGpuMemTypeStr().c_str());
5861
ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices);
59-
ev.Print("NUM_SUB_EXECS" , numSesList.size(), EnvVars::ToStr(numSesList).c_str());
62+
ev.Print("NUM_SUB_EXECS" , bmaSesList.size(), EnvVars::ToStr(bmaSesList).c_str());
6063
printf("\n");
6164
}
6265
}
@@ -66,33 +69,40 @@ int BmaSweepPreset(EnvVars& ev,
6669
return 1;
6770
}
6871

69-
int numTransfers = numGpuDevices - 1 + (localCopy ? 1 : 0);
72+
int numTransfers = numGpuDevices - 1 + (localCopy ? 1 : 0);
73+
int numBmaSubExec = (int)bmaSesList.size();
74+
int numGfxSubExec = (int)gfxSesList.size();
7075

7176
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
7277
TransferBench::TestResults results;
7378

7479
// Prepare table of results
7580
int minPow2Exp = 12;
7681
int maxPow2Exp = 30;
77-
int numRows = (maxPow2Exp - minPow2Exp + 1) + 1;
78-
int numCols = 2 + numSesList.size();
82+
int numRows = 1 + (bytesSpecified ? 1 : (maxPow2Exp - minPow2Exp + 1));
83+
int numCols = 2 + numBmaSubExec + numGfxSubExec;
7984

8085
Utils::TableHelper table(numRows, numCols);
81-
82-
Utils::Print("Performing %d simultaneous DMA Transfers from GPU %0 to other GPUs\n", numTransfers, exeIndex);
86+
Utils::Print("Performing %d simultaneous DMA Transfers from GPU %d to other GPUs\n", numTransfers, exeIndex);
8387

8488
// Prepare headers
89+
8590
table.Set(0, 0, " Bytes ");
8691
table.Set(0, 1, " DMA ");
87-
for (int i = 0; i < numSesList.size(); i++) {
88-
table.Set(0, 2+i, " BMA (%d) ", numSesList[i]);
92+
for (int i = 0; i < numBmaSubExec; i++) {
93+
table.Set(0, 2+i, " BMA(%02d) ", bmaSesList[i]);
94+
}
95+
for (int i = 0; i < numGfxSubExec; i++) {
96+
table.Set(0, 2+numBmaSubExec+i, " GFX(%02d) ", gfxSesList[i]);
8997
}
98+
9099
table.DrawRowBorder(0);
91100
table.DrawRowBorder(1);
92101
table.DrawRowBorder(numRows);
93102
table.DrawColBorder(0);
94103
table.DrawColBorder(1);
95104
table.DrawColBorder(2);
105+
table.DrawColBorder(2+numBmaSubExec);
96106
table.DrawColBorder(numCols);
97107

98108
if (!ev.outputToCsv){
@@ -101,6 +111,8 @@ int BmaSweepPreset(EnvVars& ev,
101111
};
102112

103113
for (size_t numBytes = 1ULL<<minPow2Exp, currRow=1; numBytes <= (1ULL<<maxPow2Exp); numBytes<<=1, currRow++) {
114+
if (bytesSpecified) numBytes = numBytesPerTransfer;
115+
104116
if (!ev.outputToCsv) {
105117
Utils::Print(".");
106118
fflush(stdout);
@@ -132,8 +144,8 @@ int BmaSweepPreset(EnvVars& ev,
132144

133145
// BMA executor next
134146
t.exeDevice = {EXE_GPU_BDMA, exeIndex};
135-
for (int i = 0; i < numSesList.size(); i++) {
136-
t.numSubExecs = numSesList[i];
147+
for (int i = 0; i < numBmaSubExec; i++) {
148+
t.numSubExecs = bmaSesList[i];
137149

138150
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
139151
for (auto const& err : results.errResults)
@@ -143,6 +155,21 @@ int BmaSweepPreset(EnvVars& ev,
143155

144156
table.Set(currRow, 2+i, " %6.2f ", numTransfers * results.tfrResults[0].avgBandwidthGbPerSec);
145157
}
158+
159+
// GFX executor last
160+
t.exeDevice = {EXE_GPU_GFX, exeIndex};
161+
for (int i = 0; i < numGfxSubExec; i++) {
162+
t.numSubExecs = gfxSesList[i];
163+
164+
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
165+
for (auto const& err : results.errResults)
166+
Utils::Print("%s\n", err.errMsg.c_str());
167+
return 1;
168+
}
169+
170+
table.Set(currRow, 2+numBmaSubExec+i, " %6.2f ", numTransfers * results.tfrResults[0].avgBandwidthGbPerSec);
171+
}
172+
if (bytesSpecified) break;
146173
}
147174

148175
if (!ev.outputToCsv) {

src/header/TransferBench.hpp

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,10 @@ THE SOFTWARE.
7878
#endif
7979
/// @endcond
8080

81-
// Batched DMA executor is only supported with HIP >= 7.0
82-
#if defined(__HIP_PLATFORM_AMD__) && defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >= 7)
81+
// Batched DMA executor is only supported with HIP >= 7.1
82+
#if defined(__HIP_PLATFORM_AMD__) && \
83+
defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >= 7) && \
84+
defined(HIP_VERSION_MINOR) && (HIP_VERSION_MINOR >= 1)
8385
#define BMA_EXEC_ENABLED
8486
#endif
8587

@@ -104,7 +106,7 @@ namespace TransferBench
104106
EXE_GPU_DMA = 2, ///< GPU SDMA executor (subExecutor = not supported)
105107
EXE_NIC = 3, ///< NIC RDMA executor (subExecutor = queue pair)
106108
EXE_NIC_NEAREST = 4, ///< NIC RDMA nearest executor (subExecutor = queue pair)
107-
EXE_GPU_BDMA = 5, ///< GPU Batched SDMA execttor (subExecutor = batch size)
109+
EXE_GPU_BDMA = 5, ///< GPU Batched SDMA executor (subExecutor = batch item)
108110
};
109111
char const ExeTypeStr[7] = "CGDINB";
110112
inline bool IsCpuExeType(ExeType e){ return e == EXE_CPU; }
@@ -2170,7 +2172,7 @@ namespace {
21702172
}
21712173

21722174
if (t.numBytes % 4) {
2173-
errors.push_back({ERR_FATAL, "Transfer %d: numBytes must be a multiple of 4\n", t.numBytes});
2175+
errors.push_back({ERR_FATAL, "Transfer %d: numBytes (%lu) must be a multiple of 4\n", i, t.numBytes});
21742176
break;
21752177
}
21762178

@@ -2418,7 +2420,7 @@ namespace {
24182420
break;
24192421
#else
24202422
errors.push_back({ERR_FATAL,
2421-
"Transfer %d: BMA executor requires ROCm 7.0 or newer (AMD HIP with hipMemcpyBatchAsync)", i});
2423+
"Transfer %d: BMA executor requires ROCm 7.1 or newer (AMD HIP with hipMemcpyBatchAsync)", i});
24222424
hasFatalError = true;
24232425
break;
24242426
#endif
@@ -3890,11 +3892,13 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
38903892
rss.batchSrcs.clear();
38913893
rss.batchBytes.clear();
38923894

3893-
for (int i = 0; i < transfer.numSubExecs; ++i) {
3894-
for (int j = 0; j < (int)rss.dstMem.size(); j++) {
3895-
rss.batchSrcs.push_back(subExecParam[i].src[0]);
3896-
rss.batchDsts.push_back(subExecParam[i].dst[j]);
3897-
rss.batchBytes.push_back(subExecParam[i].N * sizeof(float));
3895+
if (transfer.exeDevice.exeType == EXE_GPU_BDMA) {
3896+
for (int i = 0; i < transfer.numSubExecs; ++i) {
3897+
for (int j = 0; j < (int)rss.dstMem.size(); j++) {
3898+
rss.batchSrcs.push_back(subExecParam[i].src[0]);
3899+
rss.batchDsts.push_back(subExecParam[i].dst[j]);
3900+
rss.batchBytes.push_back(subExecParam[i].N * sizeof(float));
3901+
}
38983902
}
38993903
}
39003904
#endif
@@ -6601,7 +6605,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
66016605
topo.numExecutorSubIndices[{EXE_GPU_BDMA, exeIndex}] = 0;
66026606
topo.numSubExecutors[{EXE_GPU_GFX, exeIndex}] = numDeviceCUs;
66036607
topo.numSubExecutors[{EXE_GPU_DMA, exeIndex}] = 1;
6604-
topo.numSubExecutors[{EXE_GPU_DMA, exeIndex}] = numDmaEngines;
6608+
topo.numSubExecutors[{EXE_GPU_BDMA, exeIndex}] = numDmaEngines;
66056609
topo.closestCpuNumaToGpu[exeIndex] = closestNuma;
66066610
topo.closestNicsToGpu[exeIndex] = {};
66076611
}

0 commit comments

Comments
 (0)