Skip to content

Commit ac4e8a2

Browse files
Adding Batched DMA support (hipMemcpyBatchAsync), and bmasweep preset
1 parent 1ef9c51 commit ac4e8a2

6 files changed

Lines changed: 454 additions & 73 deletions

File tree

CHANGELOG.md

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,20 @@ Documentation for TransferBench is available at
1414
- Adding NIC_CQ_POLL_BATCH to control CQ poll batch size for NIC transfers
1515
- New "hbm" preset which sweeps and tests local HBM read performance
1616
- Added a new TB_WALLCLOCK_RATE that will override GPU GFX wallclock rate if it returns 0 (debug)
17+
- Adding new batched-DMA executor "B", which utilizes the hipMemcpyBatchAsync API introduced in HIP 7.0
18+
- Added new bmasweep preset that compares DMA to batched DMA execution for parallel transfers to other GPUs
1719

1820
### Modified
19-
- DMA-BUF support enablement in CMake changed to ENABLE_DMA_BUF to be more similar to other compile-time options
20-
- Adding extra information to CMake and make build methods to indicate enabled / disabled features
21-
- a2asweep preset changes from USE_FINE_GRAIN to MEM_TYPE to reflect various memory types
22-
- a2asweep preset changes from NUM_CUS to NUM_SUB_EXECS to match with a2a preset naming convention
23-
- scaling preset changes from using USE_FINE_GRAIN to CPU_MEM_TYPE and GPU_MEM_TYPE
24-
- NIC_FILTER renamed to TB_NIC_FILTER for consistency
25-
- DUMP_LINES renamed to TB_DUMP_LINES for consistency
26-
- Dynamically size CQs for NIC transfers in high QPs case
21+
- DMA-BUF support enablement in CMake changed to ENABLE_DMA_BUF to be more similar to other compile-time options
22+
- Adding extra information to CMake and make build methods to indicate enabled / disabled features
23+
- a2asweep preset changes from USE_FINE_GRAIN to MEM_TYPE to reflect various memory types
24+
- a2asweep preset changes from NUM_CUS to NUM_SUB_EXECS to match with a2a preset naming convention
25+
- scaling preset changes from using USE_FINE_GRAIN to CPU_MEM_TYPE and GPU_MEM_TYPE
26+
- NIC_FILTER renamed to TB_NIC_FILTER for consistency
27+
- DUMP_LINES renamed to TB_DUMP_LINES for consistency
28+
- Dynamically size CQs for NIC transfers in high QPs case
29+
- Switch to using hipMemcpyDeviceToDeviceNoCU instead of hipMemcpyDefault for DMA Executor if available (requires HIP >= 6.0)
30+
- Allow for multiple destination memory locations for DMA/Batched-DMA Transfers
2731

2832
## v1.66.02
2933
### Added

examples/example.cfg

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,13 @@
88
# SRC 1 -> Executor -> DST 1
99
# SRC X DST Y
1010

11-
# Three Executors are supported by TransferBench
11+
# Five Executors are supported by TransferBench
1212
# Executor: SubExecutor:
1313
# 1) CPU CPU thread
1414
# 2) GPU GPU threadblock/Compute Unit (CU)
1515
# 3) DMA N/A. (May only be used for copies (single SRC/DST))
1616
# 4) NIC Queue Pair
17+
# 5) Batched-DMA Batch size
1718

1819
# Each single line in the configuration file defines a set of Transfers (a Test) to run in parallel
1920

@@ -38,6 +39,7 @@
3839
# - C: CPU-executed (Indexed from 0 to # NUMA nodes - 1)
3940
# - G: GPU-executed (Indexed from 0 to # GPUs - 1)
4041
# - D: DMA-executor (Indexed from 0 to # GPUs - 1)
42+
# - B: Batched-DMA-executor (Indexed from 0 to # GPUs - 1)
4143
# - I#.#: NIC executor (Indexed from 0 to # NICs - 1)
4244
# - N#.#: Nearest NIC executor (Indexed from 0 to # GPUs - 1)
4345
# dstMemL : Destination memory locations (Where the data is to be written to)

src/client/Presets/BmaSweep.hpp

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
/*
2+
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
3+
4+
Permission is hereby granted, free of charge, to any person obtaining a copy
5+
of this software and associated documentation files (the "Software"), to deal
6+
in the Software without restriction, including without limitation the rights
7+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8+
copies of the Software, and to permit persons to whom the Software is
9+
furnished to do so, subject to the following conditions:
10+
11+
The above copyright notice and this permission notice shall be included in
12+
all copies or substantial portions of the Software.
13+
14+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20+
THE SOFTWARE.
21+
*/
22+
23+
/// Preset comparing the DMA executor against the batched-DMA (BMA) executor for
/// parallel single-source / multi-destination copies from one GPU to all others.
///
/// @param ev                  Environment-variable configuration for this run
/// @param numBytesPerTransfer Unused here; part of the common PresetFunc signature
/// @param presetName          Unused here; part of the common PresetFunc signature
/// @param bytesSpecified      Unused here; part of the common PresetFunc signature
/// @return 0 on success, 1 on error (multi-node, missing BMA support, bad index,
///         or a failed transfer run)
int BmaSweepPreset(EnvVars& ev,
                   size_t const numBytesPerTransfer,
                   std::string const presetName,
                   bool const bytesSpecified)
{
  if (TransferBench::GetNumRanks() > 1) {
    Utils::Print("[ERROR] BMA sweep preset currently not supported for multi-node\n");
    return 1;
  }

#ifndef BMA_EXEC_ENABLED
  // hipMemcpyBatchAsync (BMA executor) only exists in HIP 7.0 / ROCm 7.0+
  Utils::Print("[ERROR] BMA executor requires ROCm 7.0 or newer\n");
  return 1;
#endif

  int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);

  // Collect env vars for this preset
  int exeIndex      = EnvVars::GetEnvVar("EXE_INDEX"      , 0);
  int localCopy     = EnvVars::GetEnvVar("LOCAL_COPY"     , 0);
  int gpuMemTypeIdx = EnvVars::GetEnvVar("GPU_MEM_TYPE"   , 0);
  int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
  vector<int> numSesList = EnvVars::GetEnvVarArray("NUM_SUB_EXECS", {1,2,4,8});

  MemType gpuMemType = Utils::GetGpuMemType(gpuMemTypeIdx);

  // Display environment variables
  if (Utils::RankDoesOutput()) {
    ev.DisplayEnvVars();
    if (!ev.hideEnv) {
      int outputToCsv = ev.outputToCsv;
      if (!outputToCsv) printf("[BMA Sweep Related]\n");
      ev.Print("EXE_INDEX"      , exeIndex,      "Executing on GPU %d", exeIndex);
      ev.Print("LOCAL_COPY"     , localCopy,     "%s local copy to GPU %d", localCopy ? "Including" : "Excluding", exeIndex);
      ev.Print("GPU_MEM_TYPE"   , gpuMemTypeIdx, "Using %s (%s)", Utils::GetGpuMemTypeStr(gpuMemTypeIdx).c_str(), Utils::GetAllGpuMemTypeStr().c_str());
      ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices);
      // size() is size_t; cast to int to match the count argument convention
      ev.Print("NUM_SUB_EXECS"  , (int)numSesList.size(), EnvVars::ToStr(numSesList).c_str());
      printf("\n");
    }
  }

  if (exeIndex < 0 || exeIndex >= numGpuDevices) {
    Utils::Print("EXE_INDEX must be between 0 and %d inclusively\n", numGpuDevices - 1);
    return 1;
  }

  // One destination per remote GPU, plus optionally a local copy
  int numTransfers = numGpuDevices - 1 + (localCopy ? 1 : 0);

  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
  TransferBench::TestResults results;

  // Prepare table of results: one row per power-of-2 size (4KB..1GB) plus header
  int minPow2Exp = 12;
  int maxPow2Exp = 30;
  int numRows = (maxPow2Exp - minPow2Exp + 1) + 1;
  int numCols = 2 + (int)numSesList.size();

  Utils::TableHelper table(numRows, numCols);

  // NOTE: fixed "%0" -> "%d" so exeIndex is actually printed
  Utils::Print("Performing %d simultaneous DMA Transfers from GPU %d to other GPUs\n", numTransfers, exeIndex);

  // Prepare headers
  table.Set(0, 0, "   Bytes   ");
  table.Set(0, 1, "   DMA   ");
  for (size_t i = 0; i < numSesList.size(); i++) {
    table.Set(0, 2+i, " BMA (%d) ", numSesList[i]);
  }
  table.DrawRowBorder(0);
  table.DrawRowBorder(1);
  table.DrawRowBorder(numRows);
  table.DrawColBorder(0);
  table.DrawColBorder(1);
  table.DrawColBorder(2);
  table.DrawColBorder(numCols);

  if (!ev.outputToCsv) {
    Utils::Print("Executing: ");
    fflush(stdout);
  }

  for (size_t numBytes = 1ULL<<minPow2Exp, currRow=1; numBytes <= (1ULL<<maxPow2Exp); numBytes<<=1, currRow++) {
    if (!ev.outputToCsv) {
      Utils::Print(".");
      fflush(stdout);
    }

    table.Set(currRow, 0, " %lu ", numBytes);
    std::vector<Transfer> transfers(1);

    // Single Transfer reading from the executing GPU and writing to every
    // selected destination GPU (optionally including itself)
    Transfer& t = transfers[0];
    t.numBytes = numBytes;
    t.srcs = {{gpuMemType, exeIndex}};
    t.dsts.clear();
    for (int i = 0; i < numGpuDevices; i++) {
      if (i == exeIndex && localCopy == 0) continue;
      t.dsts.push_back({gpuMemType, i});
    }

    // DMA executor first
    t.exeDevice = {EXE_GPU_DMA, exeIndex};
    t.numSubExecs = 1;

    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
      for (auto const& err : results.errResults)
        Utils::Print("%s\n", err.errMsg.c_str());
      return 1;
    }

    table.Set(currRow, 1, " %6.2f ", numTransfers * results.tfrResults[0].avgBandwidthGbPerSec);

    // BMA executor next, swept across sub-executor (batch size) counts
    t.exeDevice = {EXE_GPU_BDMA, exeIndex};
    for (size_t i = 0; i < numSesList.size(); i++) {
      t.numSubExecs = numSesList[i];

      if (!TransferBench::RunTransfers(cfg, transfers, results)) {
        for (auto const& err : results.errResults)
          Utils::Print("%s\n", err.errMsg.c_str());
        return 1;
      }

      table.Set(currRow, 2+i, " %6.2f ", numTransfers * results.tfrResults[0].avgBandwidthGbPerSec);
    }
  }

  if (!ev.outputToCsv) {
    Utils::Print("\n");
  }
  table.PrintTable(ev.outputToCsv, ev.showBorders);
  Utils::Print("Reported numbers are all GB/s, normalized per Transfer for %d Transfers\n", numTransfers);

  return 0;
}

src/client/Presets/Presets.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ THE SOFTWARE.
3030
#include "AllToAll.hpp"
3131
#include "AllToAllN.hpp"
3232
#include "AllToAllSweep.hpp"
33+
#include "BmaSweep.hpp"
3334
#include "HbmBandwidth.hpp"
3435
#include "HealthCheck.hpp"
3536
#include "NicRings.hpp"
@@ -52,6 +53,7 @@ std::map<std::string, std::pair<PresetFunc, std::string>> presetFuncMap =
5253
{"a2a", {AllToAllPreset, "Tests parallel transfers between all pairs of GPU devices"}},
5354
{"a2a_n", {AllToAllRdmaPreset, "Tests parallel transfers between all pairs of GPU devices using Nearest NIC RDMA transfers"}},
5455
{"a2asweep", {AllToAllSweepPreset, "Test GFX-based all-to-all transfers swept across different CU and GFX unroll counts"}},
56+
{"bmasweep", {BmaSweepPreset, "Test and compare batched DMA executor for multi destination copies"}},
5557
{"hbm", {HbmBandwidthPreset, "Tests HBM bandwidth"}},
5658
{"healthcheck", {HealthCheckPreset, "Simple bandwidth health check (MI300X series only)"}},
5759
{"nicrings", {NicRingsPreset, "Tests NIC rings created across identical NIC indices across ranks"}},

src/client/Utilities.hpp

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -393,12 +393,13 @@ namespace TransferBench::Utils
393393
std::string ExeTypeToStr(ExeType exeType)
394394
{
395395
switch (exeType) {
396-
case EXE_CPU: return "CPU";
397-
case EXE_GPU_GFX: return "GPU";
398-
case EXE_GPU_DMA: return "DMA";
399-
case EXE_NIC: return "NIC";
400-
case EXE_NIC_NEAREST: return "NIC";
401-
default: return "N/A";
396+
case EXE_CPU: return "CPU";
397+
case EXE_GPU_GFX: return "GPU";
398+
case EXE_GPU_DMA: return "DMA";
399+
case EXE_NIC: return "NIC";
400+
case EXE_NIC_NEAREST: return "NIC";
401+
case EXE_GPU_BDMA: return "BMA";
402+
default: return "N/A";
402403
}
403404
}
404405

0 commit comments

Comments
 (0)