@@ -50,6 +50,7 @@ void AllToAllSweepPreset(EnvVars& ev,
5050 int useSpray = EnvVars::GetEnvVar (" USE_SPRAY" , 0 );
5151 int verbose = EnvVars::GetEnvVar (" VERBOSE" , 0 );
5252
53+ std::vector<int > blockList = EnvVars::GetEnvVarArray (" BLOCKSIZES" , {256 });
5354 std::vector<int > unrollList = EnvVars::GetEnvVarArray (" UNROLLS" , {1 ,2 ,3 ,4 ,6 ,8 });
5455 std::vector<int > numCusList = EnvVars::GetEnvVarArray (" NUM_CUS" , {4 ,8 ,12 ,16 ,24 ,32 });
5556
@@ -77,6 +78,7 @@ void AllToAllSweepPreset(EnvVars& ev,
7778 ev.Print (" A2A_MODE" , (a2aMode == A2A_CUSTOM) ? std::to_string (numSrcs) + " :" + std::to_string (numDsts) : std::to_string (a2aMode),
7879 (a2aMode == A2A_CUSTOM) ? (std::to_string (numSrcs) + " read(s) " +
7980 std::to_string (numDsts) + " write(s)" ).c_str (): a2aModeStr[a2aMode]);
81+ ev.Print (" BLOCKSIZES" , blockList.size () , EnvVars::ToStr (blockList).c_str ());
8082 ev.Print (" SHOW_MIN_ONLY" , showMinOnly , showMinOnly ? " Showing only slowest GPU results" : " Showing slowest and fastest GPU results" );
8183 ev.Print (" NUM_CUS" , numCusList.size (), EnvVars::ToStr (numCusList).c_str ());
8284 ev.Print (" NUM_GPU_DEVICES" , numGpus , " Using %d GPUs" , numGpus);
@@ -180,48 +182,53 @@ void AllToAllSweepPreset(EnvVars& ev,
180182 std::map<std::pair<int , int >, TransferBench::TestResults> results;
181183
182184 // Display summary
183- printf (" #CUs\\ Unroll" );
184- for (int u : unrollList) {
185- printf (" %d(Min) " , u);
186- if (!showMinOnly) printf (" %d(Max) " , u);
187- }
188- printf (" \n " );
189- for (int c : numCusList) {
190- printf (" %5d " , c); fflush (stdout);
185+ for (int blockSize : blockList) {
186+ printf (" Blocksize: %d\n " , blockSize);
187+ ev.gfxBlockSize = cfg.gfx .blockSize = blockSize;
188+
189+ printf (" #CUs\\ Unroll" );
191190 for (int u : unrollList) {
192- ev.gfxUnroll = cfg.gfx .unrollFactor = u;
193- for (auto & transfer : transfers)
194- transfer.numSubExecs = useSpray ? (c * targetCount) : c;
195-
196- double minBandwidth = std::numeric_limits<double >::max ();
197- double maxBandwidth = std::numeric_limits<double >::min ();
198- TransferBench::TestResults result;
199- if (TransferBench::RunTransfers (cfg, transfers, result)) {
200- for (auto const & exeResult : result.exeResults ) {
201- minBandwidth = std::min (minBandwidth, exeResult.second .avgBandwidthGbPerSec );
202- maxBandwidth = std::max (maxBandwidth, exeResult.second .avgBandwidthGbPerSec );
203- }
204- if (useSpray) {
205- minBandwidth *= targetCount;
206- maxBandwidth *= targetCount;
207- }
208- results[std::make_pair (c,u)] = result;
209- } else {
210- minBandwidth = 0.0 ;
211- }
212- printf (" %7.2f " , minBandwidth);
213- if (!showMinOnly) printf (" %7.2f " , maxBandwidth);
214- fflush (stdout);
191+ printf (" %d(Min) " , u);
192+ if (!showMinOnly) printf (" %d(Max) " , u);
215193 }
216- printf (" \n " ); fflush (stdout);
217- }
218-
219- if (verbose) {
220- int testNum = 0 ;
194+ printf (" \n " );
221195 for (int c : numCusList) {
196+ printf (" %5d " , c); fflush (stdout);
222197 for (int u : unrollList) {
223- printf (" CUs: %d Unroll %d\n " , c, u);
224- PrintResults (ev, ++testNum, transfers, results[std::make_pair (c,u)]);
198+ ev.gfxUnroll = cfg.gfx .unrollFactor = u;
199+ for (auto & transfer : transfers)
200+ transfer.numSubExecs = useSpray ? (c * targetCount) : c;
201+
202+ double minBandwidth = std::numeric_limits<double >::max ();
203+ double maxBandwidth = std::numeric_limits<double >::min ();
204+ TransferBench::TestResults result;
205+ if (TransferBench::RunTransfers (cfg, transfers, result)) {
206+ for (auto const & exeResult : result.exeResults ) {
207+ minBandwidth = std::min (minBandwidth, exeResult.second .avgBandwidthGbPerSec );
208+ maxBandwidth = std::max (maxBandwidth, exeResult.second .avgBandwidthGbPerSec );
209+ }
210+ if (useSpray) {
211+ minBandwidth *= targetCount;
212+ maxBandwidth *= targetCount;
213+ }
214+ results[std::make_pair (c,u)] = result;
215+ } else {
216+ minBandwidth = 0.0 ;
217+ }
218+ printf (" %7.2f " , minBandwidth);
219+ if (!showMinOnly) printf (" %7.2f " , maxBandwidth);
220+ fflush (stdout);
221+ }
222+ printf (" \n " ); fflush (stdout);
223+ }
224+
225+ if (verbose) {
226+ int testNum = 0 ;
227+ for (int c : numCusList) {
228+ for (int u : unrollList) {
229+ printf (" CUs: %d Unroll %d\n " , c, u);
230+ PrintResults (ev, ++testNum, transfers, results[std::make_pair (c,u)]);
231+ }
225232 }
226233 }
227234 }
0 commit comments