Skip to content

Commit 8bd9fe4

Browse files
authored
Merge branch 'candidate' into BmaExecutor
2 parents 277408f + 2900b4e commit 8bd9fe4

2 files changed

Lines changed: 192 additions & 0 deletions

File tree

src/client/Presets/GfxSweep.hpp

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
/*
2+
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
3+
4+
Permission is hereby granted, free of charge, to any person obtaining a copy
5+
of this software and associated documentation files (the "Software"), to deal
6+
in the Software without restriction, including without limitation the rights
7+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8+
copies of the Software, and to permit persons to whom the Software is
9+
furnished to do so, subject to the following conditions:
10+
11+
The above copyright notice and this permission notice shall be included in
12+
all copies or substantial portions of the Software.
13+
14+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20+
THE SOFTWARE.
21+
*/
22+
23+
#include "EnvVars.hpp"
24+
25+
namespace {

// Returns true when `spec`, after any leading whitespace, begins with a
// decimal digit or with a '-' that is immediately followed by a decimal digit
// (i.e. the text starts with something that reads as an integer).
// Returns false for empty / all-whitespace input and for anything else.
bool LooksLikeFullTransferLine(std::string const& spec)
{
  // Bounds-checked "is there a decimal digit at this offset?" helper.
  // The unsigned char cast keeps isdigit() well-defined for bytes >= 0x80.
  auto const hasDigitAt = [&spec](size_t pos) {
    return pos < spec.size() && isdigit(static_cast<unsigned char>(spec[pos])) != 0;
  };

  // Locate the first non-whitespace character.
  size_t first = 0;
  for (; first < spec.size(); ++first) {
    if (!isspace(static_cast<unsigned char>(spec[first])))
      break;
  }

  // Nothing but whitespace (or nothing at all).
  if (first == spec.size())
    return false;

  // A leading minus sign only counts when a digit directly follows it.
  return hasDigitAt(spec[first] == '-' ? first + 1 : first);
}

} // namespace
40+
41+
// "gfxsweep" preset: runs one transfer repeatedly while sweeping GFX settings
// and prints a bandwidth table for every (block size, word size, temporal
// mode, wave order) combination. Table rows are sub-executor counts and table
// columns are unroll factors.
//
// Environment variables read here (defaults in parentheses):
//   GFX_SWEEP_TRANSFER  transfer spec ("G0->G0->G0", or "R0G0->R0G0->R0G0"
//                       when GetNumRanks() > 1)
//   BLOCKSIZES (256), UNROLLS (1,2,3,4,6,8), NUM_SUB_EXECS (4,8,12,16,24,32),
//   WORDSIZES (4), TEMPORAL_MODES (0), WAVE_ORDERS (0)
//   SHOW_MIN_ONLY (1)   non-zero prints only the minimum column per unroll
//   VERBOSE (0)         non-zero prints per-point results after each table
//
// Parameters:
//   ev                   environment settings; its gfxBlockSize, gfxWordSize,
//                        gfxTemporal, gfxWaveOrder and gfxUnroll fields are
//                        overwritten as the sweep progresses
//   numBytesPerTransfer  stored into the transfer's numBytes
//   presetName           not used by this preset
//   bytesSpecified       not used by this preset
//
// Returns 0 once the sweep has finished. Returns 1 when the spec does not
// parse to exactly one transfer, or when that transfer's executor type is not
// EXE_GPU_GFX. Parse errors themselves are handed to Utils::CheckForError.
int GfxSweepPreset(EnvVars& ev,
                   size_t const numBytesPerTransfer,
                   std::string const presetName,
                   bool const bytesSpecified)
{
  int showMinOnly = EnvVars::GetEnvVar("SHOW_MIN_ONLY", 1);
  int verbose = EnvVars::GetEnvVar("VERBOSE", 0);
  std::vector<int> blockList = EnvVars::GetEnvVarArray("BLOCKSIZES", {256});
  std::vector<int> unrollList = EnvVars::GetEnvVarArray("UNROLLS", {1, 2, 3, 4, 6, 8});
  std::vector<int> numSesList = EnvVars::GetEnvVarArray("NUM_SUB_EXECS", {4, 8, 12, 16, 24, 32});
  std::vector<int> wordSizeList = EnvVars::GetEnvVarArray("WORDSIZES", {4});
  std::vector<int> temporalList = EnvVars::GetEnvVarArray("TEMPORAL_MODES", {0});
  std::vector<int> waveOrderList = EnvVars::GetEnvVarArray("WAVE_ORDERS", {0});

  std::string const spec = EnvVars::GetEnvVar("GFX_SWEEP_TRANSFER",
    TransferBench::GetNumRanks() > 1 ? "R0G0->R0G0->R0G0" : "G0->G0->G0");
  // A spec that does not already start with a number gets "1 1 " prepended
  // before it is handed to the transfer parser.
  std::string const line = LooksLikeFullTransferLine(spec) ? spec : (std::string("1 1 ") + spec);

  std::vector<TransferBench::Transfer> transfers;
  TransferBench::Utils::CheckForError(TransferBench::ParseTransfers(line, transfers));

  // The sweep drives exactly one transfer; anything else is rejected.
  if (transfers.size() != 1) {
    if (TransferBench::GetNumRanks() > 1 && transfers.size() > 1) {
      TransferBench::Utils::Print(
        "[WARN] gfxsweep: In Multinode setting, omitted rank fields on SRC/DST/EXE are filled per rank, "
        "and transfers without ranks specified will expand to multiple parallel copy per node. "
        "gfxsweep expects exactly one entry here and forbids such entries; for a local sweep use a single rank (`-np 1`), "
        "or adjust GFX_SWEEP_TRANSFER / rank syntax so expansion yields one transfer.\n");
    }
    TransferBench::Utils::Print(
      "[ERROR] gfxsweep expects exactly one transfer after parsing (got %zu). "
      "Set GFX_SWEEP_TRANSFER to a single SRC EXE DST triplet or one basic/advanced line that expands to one transfer.\n",
      transfers.size());
    return 1;
  }

  if (transfers[0].exeDevice.exeType != TransferBench::EXE_GPU_GFX) {
    TransferBench::Utils::Print(
      "[ERROR] gfxsweep requires a GPU GFX (G) executor; parsed executor type is not GFX.\n");
    return 1;
  }

  transfers[0].numBytes = numBytesPerTransfer;

  if (TransferBench::Utils::RankDoesOutput()) {
    ev.DisplayEnvVars();
    if (!ev.hideEnv) {
      if (!ev.outputToCsv)
        TransferBench::Utils::Print("[GfxSweep Related]\n");
      ev.Print("GFX_SWEEP_TRANSFER", spec, "Transfer spec (see config file format)");
      ev.Print("BLOCKSIZES", blockList.size(), EnvVars::ToStr(blockList).c_str());
      ev.Print("NUM_SUB_EXECS", numSesList.size(), EnvVars::ToStr(numSesList).c_str());
      ev.Print("WORDSIZES", wordSizeList.size(), EnvVars::ToStr(wordSizeList).c_str());
      ev.Print("TEMPORAL_MODES", temporalList.size(), EnvVars::ToStr(temporalList).c_str());
      ev.Print("WAVE_ORDERS", waveOrderList.size(), EnvVars::ToStr(waveOrderList).c_str());
      ev.Print("SHOW_MIN_ONLY", showMinOnly, showMinOnly ? "Showing only slowest sub-executor aggregate" : "Showing slowest and fastest");
      ev.Print("UNROLLS", unrollList.size(), EnvVars::ToStr(unrollList).c_str());
      ev.Print("VERBOSE", verbose, verbose ? "Display test results" : "Display summary only");
      TransferBench::Utils::Print("\n");
    }
  }

  TransferBench::Utils::Print("GFX sweep (single transfer):\n");
  TransferBench::Utils::Print("============================\n");
  TransferBench::Utils::Print("- Parsed line: %s\n", line.c_str());
  TransferBench::Utils::Print("- %lu bytes per transfer\n", static_cast<unsigned long>(numBytesPerTransfer));

  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();

  using GfxSweepKey = std::tuple<int, int, int, int, int, int>; // block, wordSize, temporal, waveOrder, subExecs, unroll

  for (int blockSize : blockList) {
    ev.gfxBlockSize = cfg.gfx.blockSize = blockSize;

    for (int wordSize : wordSizeList) {
      ev.gfxWordSize = cfg.gfx.wordSize = wordSize;

      for (int temporalMode : temporalList) {
        ev.gfxTemporal = cfg.gfx.temporalMode = temporalMode;

        for (int waveOrder : waveOrderList) {
          ev.gfxWaveOrder = cfg.gfx.waveOrder = waveOrder;

          // Results are only consumed by the verbose dump of this table, so
          // the map is scoped per table. This also stops a repeated list
          // entry (e.g. BLOCKSIZES=256,256) from surfacing results left over
          // from an earlier pass.
          std::map<GfxSweepKey, TransferBench::TestResults> results;

          TransferBench::Utils::Print("Blocksize: %d WORD_SIZE: %d TEMPORAL: %d WAVE_ORDER: %d\n",
                                      blockSize, wordSize, temporalMode, waveOrder);

          // Table header: one Min column (and optionally one Max column) per unroll factor.
          TransferBench::Utils::Print("#CUs\\Unroll");
          for (int u : unrollList) {
            TransferBench::Utils::Print(" %d(Min) ", u);
            if (!showMinOnly)
              TransferBench::Utils::Print(" %d(Max) ", u);
          }
          TransferBench::Utils::Print("\n");

          for (int c : numSesList) {
            TransferBench::Utils::Print(" %5d ", c);
            fflush(stdout);
            for (int u : unrollList) {
              ev.gfxUnroll = cfg.gfx.unrollFactor = u;
              transfers[0].numSubExecs = c;

              // Both values stay at 0.0 when the run fails or reports no
              // executor results, so the table never shows a sentinel such as
              // DBL_MAX.
              double minBandwidth = 0.0;
              double maxBandwidth = 0.0;
              TransferBench::TestResults result;
              GfxSweepKey const key = std::make_tuple(blockSize, wordSize, temporalMode, waveOrder, c, u);
              if (TransferBench::RunTransfers(cfg, transfers, result)) {
                bool isFirst = true;
                for (auto const& exeResult : result.exeResults) {
                  double const bandwidth = exeResult.second.avgBandwidthGbPerSec;
                  // Seed min/max from the first measurement instead of a numeric sentinel.
                  minBandwidth = isFirst ? bandwidth : std::min(minBandwidth, bandwidth);
                  maxBandwidth = isFirst ? bandwidth : std::max(maxBandwidth, bandwidth);
                  isFirst = false;
                }
                // Full results are only read back by the verbose dump below.
                if (verbose)
                  results[key] = result;
              } else {
                // Drop anything stored for a duplicate of this sweep point so
                // the verbose dump reports the failure instead of old data.
                results.erase(key);
              }
              TransferBench::Utils::Print(" %7.2f ", minBandwidth);
              if (!showMinOnly)
                TransferBench::Utils::Print(" %7.2f ", maxBandwidth);
              fflush(stdout);
            }
            TransferBench::Utils::Print("\n");
            fflush(stdout);
          }

          if (verbose) {
            int testNum = 0;
            for (int c : numSesList) {
              for (int u : unrollList) {
                GfxSweepKey const key = std::make_tuple(blockSize, wordSize, temporalMode, waveOrder, c, u);
                TransferBench::Utils::Print(
                  "Blocksize: %d WORD_SIZE: %d TEMPORAL: %d WAVE_ORDER: %d SubExecs: %d Unroll: %d\n",
                  blockSize, wordSize, temporalMode, waveOrder, c, u);
                // Restore the sub-executor count this point was run with before printing.
                transfers[0].numSubExecs = c;
                auto const resultIt = results.find(key);
                if (resultIt != results.end()) {
                  TransferBench::Utils::PrintResults(ev, ++testNum, transfers, resultIt->second);
                } else {
                  // Keep test numbering aligned with the sweep order even for failed points.
                  ++testNum;
                  TransferBench::Utils::Print("No results available for this sweep point (transfer run failed).\n");
                }
              }
            }
          }
        }
      }
    }
  }

  return 0;
}

src/client/Presets/Presets.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ THE SOFTWARE.
3131
#include "AllToAllN.hpp"
3232
#include "AllToAllSweep.hpp"
3333
#include "BmaSweep.hpp"
34+
#include "GfxSweep.hpp"
3435
#include "HbmBandwidth.hpp"
3536
#include "HealthCheck.hpp"
3637
#include "NicRings.hpp"
@@ -54,6 +55,7 @@ std::map<std::string, std::pair<PresetFunc, std::string>> presetFuncMap =
5455
{"a2a_n", {AllToAllRdmaPreset, "Tests parallel transfers between all pairs of GPU devices using Nearest NIC RDMA transfers"}},
5556
{"a2asweep", {AllToAllSweepPreset, "Test GFX-based all-to-all transfers swept across different CU and GFX unroll counts"}},
5657
{"bmasweep", {BmaSweepPreset, "Test and compare batched DMA executor for multi destination copies"}},
58+
{"gfxsweep", {GfxSweepPreset, "Sweep BLOCKSIZES, UNROLLS, and NUM_SUB_EXECS for one GFX transfer (GFX_SWEEP_TRANSFER)"}},
5759
{"hbm", {HbmBandwidthPreset, "Tests HBM bandwidth"}},
5860
{"healthcheck", {HealthCheckPreset, "Simple bandwidth health check (MI300X series only)"}},
5961
{"nicrings", {NicRingsPreset, "Tests NIC rings created across identical NIC indices across ranks"}},

0 commit comments

Comments
 (0)