@@ -167,10 +167,14 @@ struct HbmBwResult
167167 double bw[3 ]; // MAX | AVG | MIN
168168};
169169
170- int HbmBandwidthPreset (EnvVars& ev,
171- size_t const numBytesPerTransfer,
172- std::string const presetName)
170+ int HbmBandwidthPreset (EnvVars& ev,
171+ size_t const numBytesPerTransfer,
172+ std::string const presetName,
173+ bool const bytesSpecified)
173174{
175+ // If bytes aren't specified, default to 1GB
176+ size_t numBytesAtLeast = (bytesSpecified ? numBytesPerTransfer : 1024 * 1024 * 1024 );
177+
174178 // Determine rank information
175179 int numRanks = TransferBench::GetNumRanks ();
176180 int myRank = TransferBench::GetRank ();
@@ -322,7 +326,7 @@ int HbmBandwidthPreset(EnvVars& ev,
322326 ev.Print (" CRITERIA" , criteria , " Reporting highest %s bandwidth (0=MAX,1=AVG,2=MIN)" , criteria == 0 ? " MAX" : criteria == 1 ? " AVG" : " MIN" );
323327 ev.Print (" ELEM_BYTES" , EnvVars::ToStr (elemBytes).c_str () , " Element sizes in bytes to sweep over (must contain only 4,8 or 16)" );
324328 ev.Print (" GPU_INDICES" , EnvVars::ToStr (gpuIndices).c_str (), " GPU indices to test. Leave empty for all" );
325- ev.Print (" MEM_TYPE" , memTypeIdx , " Using %s GPU memory (%s)" , devMemTypeStr.c_str (), Utils::GetAllGpuMemTypeStr ().c_str ());
329+ ev.Print (" MEM_TYPE" , memTypeIdx , " Using %s memory (%s)" , devMemTypeStr.c_str (), Utils::GetAllGpuMemTypeStr ().c_str ());
326330 ev.Print (" NUM_BUFFERS" , numBuffers , " Number of buffers to rotate through (1 per iteration)" );
327331 ev.Print (" NUM_ITERATIONS" , numIterations , " Number of iterations to time" );
328332 ev.Print (" NUM_SUB_EXECS" , EnvVars::ToStr (numSesList).c_str (), " Number of subexecutors to sweep over (default to all available)" );
@@ -344,16 +348,16 @@ int HbmBandwidthPreset(EnvVars& ev,
344348
345349 // Determine how much memory to allocate based on sweep setting
346350 // During each Step each threadblock works on BLOCKSIZE * UNROLL * ELEM_BYTES bytes
347- // Each buffer will be allocated as the smallest multiple of this, larger than numBytesPerTransfer ,
351+ // Each buffer will be allocated as the smallest multiple of this, larger than numBytesAtLeast ,
348352 // NOTE: It's not safe to just base this on the maximum values in each sweep parameter,
349- // (e.g if maximum size divides numBytesPerTransfer perfectly) so looping over entire space is safer
353+ // (e.g. if the maximum size divides numBytesAtLeast perfectly) so looping over the entire space is safer
350354 size_t largestTotalBytesPerBuffer = 0 ;
351355 for (int numSubExec : numSesList) {
352356 for (int blockSize : blockSizes) {
353357 for (int unroll : unrolls) {
354358 for (int elemByte : elemBytes) {
355359 size_t totalBytesPerStep = numSubExec * blockSize * unroll * elemByte;
356- size_t numSteps = std::max ((size_t )1 , (numBytesPerTransfer + totalBytesPerStep - 1 ) / totalBytesPerStep);
360+ size_t numSteps = std::max ((size_t )1 , (numBytesAtLeast + totalBytesPerStep - 1 ) / totalBytesPerStep);
357361 size_t totalBytesPerBuffer = numSteps * totalBytesPerStep;
358362 if (totalBytesPerBuffer > largestTotalBytesPerBuffer) largestTotalBytesPerBuffer = totalBytesPerBuffer;
359363 }
@@ -372,7 +376,7 @@ int HbmBandwidthPreset(EnvVars& ev,
372376 // Calculate total number of tests that will be executed per GPU
373377 size_t numTests = numSesList.size () * blockSizes.size () * unrolls.size () * elemBytes.size () * (temporalMask == 3 ? 2 : 1 );
374378
375- Utils::Print (" Testing (%lu configs per GPU): " , numTests);
379+ Utils::Print (" Testing on at least %lu bytes (%lu configs per GPU): " , numBytesAtLeast , numTests);
376380 fflush (stdout);
377381 }
378382
@@ -439,7 +443,7 @@ int HbmBandwidthPreset(EnvVars& ev,
439443 for (int elemByte : elemBytes) {
440444 int elemByteIdx = (int )log2 (elemByte) - 2 ;
441445 size_t totalBytesPerStep = numSubExec * blockSize * unroll * elemByte;
442- size_t numSteps = std::max ((size_t )1 , (numBytesPerTransfer + totalBytesPerStep - 1 ) / totalBytesPerStep);
446+ size_t numSteps = std::max ((size_t )1 , (numBytesAtLeast + totalBytesPerStep - 1 ) / totalBytesPerStep);
443447 size_t totalBytes = numSteps * totalBytesPerStep;
444448
445449 for (int useNt = 0 ; useNt <= 1 ; useNt++) {
0 commit comments