@@ -22,6 +22,9 @@ THE SOFTWARE.
2222
2323#include " EnvVars.hpp"
2424
25+ #include < map>
26+ #include < tuple>
27+
2528namespace {
2629
2730bool LooksLikeFullTransferLine (std::string const & spec)
@@ -41,25 +44,32 @@ bool LooksLikeFullTransferLine(std::string const& spec)
4144int GfxSweepPreset (EnvVars& ev,
4245 size_t const numBytesPerTransfer,
4346 std::string const presetName,
44- [[maybe_unused]] bool const bytesSpecified)
47+ bool const bytesSpecified)
4548{
46- (void )presetName;
47-
48- ev.useSingleStream = 1 ;
49-
5049 int showMinOnly = EnvVars::GetEnvVar (" SHOW_MIN_ONLY" , 1 );
5150 int verbose = EnvVars::GetEnvVar (" VERBOSE" , 0 );
52- std::vector<int > blockList = EnvVars::GetEnvVarArray (" BLOCKSIZES" , {256 });
53- std::vector<int > unrollList = EnvVars::GetEnvVarArray (" UNROLLS" , {1 , 2 , 3 , 4 , 6 , 8 });
54- std::vector<int > numSesList = EnvVars::GetEnvVarArray (" NUM_SUB_EXECS" , {4 , 8 , 12 , 16 , 24 , 32 });
55-
56- std::string const spec = EnvVars::GetEnvVar (" GFX_SWEEP_TRANSFER" , " G0->G0->G0" );
51+ std::vector<int > blockList = EnvVars::GetEnvVarArray (" BLOCKSIZES" , {256 });
52+ std::vector<int > unrollList = EnvVars::GetEnvVarArray (" UNROLLS" , {1 , 2 , 3 , 4 , 6 , 8 });
53+ std::vector<int > numSesList = EnvVars::GetEnvVarArray (" NUM_SUB_EXECS" , {4 , 8 , 12 , 16 , 24 , 32 });
54+ std::vector<int > wordSizeList = EnvVars::GetEnvVarArray (" WORDSIZES" , {4 });
55+ std::vector<int > temporalList = EnvVars::GetEnvVarArray (" TEMPORAL_MODES" , {0 });
56+ std::vector<int > waveOrderList = EnvVars::GetEnvVarArray (" WAVE_ORDERS" , {0 });
57+
58+ std::string const spec = EnvVars::GetEnvVar (" GFX_SWEEP_TRANSFER" ,
59+ TransferBench::GetNumRanks () > 1 ? " G0->G0->G0" : " R0G0->R0G0->R0G0" );
5760 std::string const line = LooksLikeFullTransferLine (spec) ? spec : (std::string (" 1 1 " ) + spec);
5861
5962 std::vector<TransferBench::Transfer> transfers;
6063 TransferBench::Utils::CheckForError (TransferBench::ParseTransfers (line, transfers));
6164
6265 if (transfers.size () != 1 ) {
66+ if (TransferBench::GetNumRanks () > 1 && transfers.size () > 1 ) {
67+ TransferBench::Utils::Print (
68+ " [WARN] gfxsweep: In Multinode setting, omitted rank fields on SRC/DST/EXE are filled per rank, "
69+ " and transfers without ranks specified will expand to multiple parallel copy per node. "
70+ " gfxsweep expects exactly one entry here and forbid such entries; for a local sweep use a single rank (`-np 1`), "
71+ " or adjust GFX_SWEEP_TRANSFER / rank syntax so expansion yields one transfer.\n " );
72+ }
6373 TransferBench::Utils::Print (
6474 " [ERROR] gfxsweep expects exactly one transfer after parsing (got %zu). "
6575 " Set GFX_SWEEP_TRANSFER to a single SRC EXE DST triplet or one basic/advanced line that expands to one transfer.\n " ,
@@ -75,74 +85,98 @@ int GfxSweepPreset(EnvVars& ev,
7585
7686 transfers[0 ].numBytes = numBytesPerTransfer;
7787
78- ev.DisplayEnvVars ();
79- if (!ev.hideEnv ) {
80- if (!ev.outputToCsv )
81- printf (" [GfxSweep Related]\n " );
82- ev.Print (" GFX_SWEEP_TRANSFER" , spec, " Transfer spec (see config file format)" );
83- ev.Print (" BLOCKSIZES" , blockList.size (), EnvVars::ToStr (blockList).c_str ());
84- ev.Print (" NUM_SUB_EXECS" , numSesList.size (), EnvVars::ToStr (numSesList).c_str ());
85- ev.Print (" SHOW_MIN_ONLY" , showMinOnly, showMinOnly ? " Showing only slowest sub-executor aggregate" : " Showing slowest and fastest" );
86- ev.Print (" UNROLLS" , unrollList.size (), EnvVars::ToStr (unrollList).c_str ());
87- ev.Print (" VERBOSE" , verbose, verbose ? " Display test results" : " Display summary only" );
88- printf (" \n " );
88+ if (TransferBench::Utils::RankDoesOutput ()) {
89+ ev.DisplayEnvVars ();
90+ if (!ev.hideEnv ) {
91+ if (!ev.outputToCsv )
92+ TransferBench::Utils::Print (" [GfxSweep Related]\n " );
93+ ev.Print (" GFX_SWEEP_TRANSFER" , spec, " Transfer spec (see config file format)" );
94+ ev.Print (" BLOCKSIZES" , blockList.size (), EnvVars::ToStr (blockList).c_str ());
95+ ev.Print (" NUM_SUB_EXECS" , numSesList.size (), EnvVars::ToStr (numSesList).c_str ());
96+ ev.Print (" WORDSIZES" , wordSizeList.size (), EnvVars::ToStr (wordSizeList).c_str ());
97+ ev.Print (" TEMPORAL_MODES" , temporalList.size (), EnvVars::ToStr (temporalList).c_str ());
98+ ev.Print (" WAVE_ORDERS" , waveOrderList.size (), EnvVars::ToStr (waveOrderList).c_str ());
99+ ev.Print (" SHOW_MIN_ONLY" , showMinOnly, showMinOnly ? " Showing only slowest sub-executor aggregate" : " Showing slowest and fastest" );
100+ ev.Print (" UNROLLS" , unrollList.size (), EnvVars::ToStr (unrollList).c_str ());
101+ ev.Print (" VERBOSE" , verbose, verbose ? " Display test results" : " Display summary only" );
102+ TransferBench::Utils::Print (" \n " );
103+ }
89104 }
90105
91- printf (" GFX sweep (single transfer):\n " );
92- printf (" ============================\n " );
93- printf (" - Parsed line: %s\n " , line.c_str ());
94- printf (" - %lu bytes per transfer\n " , static_cast <unsigned long >(numBytesPerTransfer));
106+ TransferBench::Utils::Print (" GFX sweep (single transfer):\n " );
107+ TransferBench::Utils::Print (" ============================\n " );
108+ TransferBench::Utils::Print (" - Parsed line: %s\n " , line.c_str ());
109+ TransferBench::Utils::Print (" - %lu bytes per transfer\n " , static_cast <unsigned long >(numBytesPerTransfer));
95110
96111 TransferBench::ConfigOptions cfg = ev.ToConfigOptions ();
97112
98- std::map<std::pair<int , int >, TransferBench::TestResults> results;
113+ using GfxSweepKey = std::tuple<int , int , int , int , int , int >; // block, wordSize, temporal, waveOrder, subExecs, unroll
114+ std::map<GfxSweepKey, TransferBench::TestResults> results;
99115
100116 for (int blockSize : blockList) {
101- printf (" Blocksize: %d\n " , blockSize);
102117 ev.gfxBlockSize = cfg.gfx .blockSize = blockSize;
103118
104- printf (" #CUs\\ Unroll" );
105- for (int u : unrollList) {
106- printf (" %d(Min) " , u);
107- if (!showMinOnly)
108- printf (" %d(Max) " , u);
109- }
110- printf (" \n " );
111-
112- for (int c : numSesList) {
113- printf (" %5d " , c);
114- fflush (stdout);
115- for (int u : unrollList) {
116- ev.gfxUnroll = cfg.gfx .unrollFactor = u;
117- transfers[0 ].numSubExecs = c;
118-
119- double minBandwidth = std::numeric_limits<double >::max ();
120- double maxBandwidth = std::numeric_limits<double >::min ();
121- TransferBench::TestResults result;
122- if (TransferBench::RunTransfers (cfg, transfers, result)) {
123- for (auto const & exeResult : result.exeResults ) {
124- minBandwidth = std::min (minBandwidth, exeResult.second .avgBandwidthGbPerSec );
125- maxBandwidth = std::max (maxBandwidth, exeResult.second .avgBandwidthGbPerSec );
119+ for (int wordSize : wordSizeList) {
120+ ev.gfxWordSize = cfg.gfx .wordSize = wordSize;
121+
122+ for (int temporalMode : temporalList) {
123+ ev.gfxTemporal = cfg.gfx .temporalMode = temporalMode;
124+
125+ for (int waveOrder : waveOrderList) {
126+ ev.gfxWaveOrder = cfg.gfx .waveOrder = waveOrder;
127+
128+ TransferBench::Utils::Print (" Blocksize: %d WORD_SIZE: %d TEMPORAL: %d WAVE_ORDER: %d\n " ,
129+ blockSize, wordSize, temporalMode, waveOrder);
130+
131+ TransferBench::Utils::Print (" #CUs\\ Unroll" );
132+ for (int u : unrollList) {
133+ TransferBench::Utils::Print (" %d(Min) " , u);
134+ if (!showMinOnly)
135+ TransferBench::Utils::Print (" %d(Max) " , u);
136+ }
137+ TransferBench::Utils::Print (" \n " );
138+
139+ for (int c : numSesList) {
140+ TransferBench::Utils::Print (" %5d " , c);
141+ fflush (stdout);
142+ for (int u : unrollList) {
143+ ev.gfxUnroll = cfg.gfx .unrollFactor = u;
144+ transfers[0 ].numSubExecs = c;
145+
146+ double minBandwidth = std::numeric_limits<double >::max ();
147+ double maxBandwidth = std::numeric_limits<double >::min ();
148+ TransferBench::TestResults result;
149+ GfxSweepKey const key = std::make_tuple (blockSize, wordSize, temporalMode, waveOrder, c, u);
150+ if (TransferBench::RunTransfers (cfg, transfers, result)) {
151+ for (auto const & exeResult : result.exeResults ) {
152+ minBandwidth = std::min (minBandwidth, exeResult.second .avgBandwidthGbPerSec );
153+ maxBandwidth = std::max (maxBandwidth, exeResult.second .avgBandwidthGbPerSec );
154+ }
155+ results[key] = result;
156+ } else {
157+ minBandwidth = 0.0 ;
158+ }
159+ TransferBench::Utils::Print (" %7.2f " , minBandwidth);
160+ if (!showMinOnly)
161+ TransferBench::Utils::Print (" %7.2f " , maxBandwidth);
162+ fflush (stdout);
163+ }
164+ TransferBench::Utils::Print (" \n " );
165+ fflush (stdout);
126166 }
127- results[std::make_pair (c, u)] = result;
128- } else {
129- minBandwidth = 0.0 ;
130- }
131- printf (" %7.2f " , minBandwidth);
132- if (!showMinOnly)
133- printf (" %7.2f " , maxBandwidth);
134- fflush (stdout);
135- }
136- printf (" \n " );
137- fflush (stdout);
138- }
139167
140- if (verbose) {
141- int testNum = 0 ;
142- for (int c : numSesList) {
143- for (int u : unrollList) {
144- printf (" SubExecs: %d Unroll %d\n " , c, u);
145- TransferBench::Utils::PrintResults (ev, ++testNum, transfers, results[std::make_pair (c, u)]);
168+ if (verbose) {
169+ int testNum = 0 ;
170+ for (int c : numSesList) {
171+ for (int u : unrollList) {
172+ GfxSweepKey const key = std::make_tuple (blockSize, wordSize, temporalMode, waveOrder, c, u);
173+ TransferBench::Utils::Print (
174+ " Blocksize: %d WORD_SIZE: %d TEMPORAL: %d WAVE_ORDER: %d SubExecs: %d Unroll: %d\n " ,
175+ blockSize, wordSize, temporalMode, waveOrder, c, u);
176+ TransferBench::Utils::PrintResults (ev, ++testNum, transfers, results[key]);
177+ }
178+ }
179+ }
146180 }
147181 }
148182 }
0 commit comments