@@ -216,7 +216,7 @@ int HbmBandwidthPreset(EnvVars& ev,
216216
217217 // Check for consistency across ranks
218218 IS_UNIFORM (blockSizes, " BLOCKSIZES" );
219- IS_UNIFORM (blockSizes, " CRITERIA" );
219+ IS_UNIFORM (criteria, " CRITERIA" );
220220 IS_UNIFORM (elemBytes, " ELEM_BYTES" );
221221 // GPU_INDICES may be different per rank
222222 IS_UNIFORM (memTypeIdx, " MEM_TYPE" );
@@ -225,7 +225,7 @@ int HbmBandwidthPreset(EnvVars& ev,
225225 IS_UNIFORM (numSesList, " NUM_SUB_EXECS" );
226226 IS_UNIFORM (prewarmMsec, " PREWARM_MSEC" );
227227 IS_UNIFORM (showDetails, " SHOW_DETAILS" );
228- IS_UNIFORM (showDetails, " SHOW_EXTRA" );
228+ IS_UNIFORM (showExtra, " SHOW_EXTRA" );
229229 IS_UNIFORM (temporalMask, " TEMPORAL_MASK" );
230230 IS_UNIFORM (unrolls, " UNROLLS" );
231231 IS_UNIFORM (useWallClock, " USE_WALLCLOCK" );
@@ -262,17 +262,18 @@ int HbmBandwidthPreset(EnvVars& ev,
262262 if (!gpuIndices.empty ()) {
263263 for (auto gpuIdx : gpuIndices) {
264264 if (gpuIdx < 0 || gpuIdx >= numDetectedGpus) {
265- Utils::Print (" [ERROR] GPU_INDICES index out of range (%d) (rank %d)" , gpuIdx, myRank);
265+ Utils::Print (" [ERROR] GPU_INDICES index out of range (%d) (rank %d)\n " , gpuIdx, myRank);
266266 return 1 ;
267267 }
268268 }
269269 }
270270
271- if (numBuffers <= 1 ) {
271+ if (numBuffers < 1 ) {
272272 Utils::Print (" [ERROR] NUM_BUFFERS must be a positive number (not %d)\n " , numBuffers);
273+ return 1 ;
273274 }
274275 if (numBuffers > numIterations) {
275- Utils::Print (" [WARN] NUM_BUFFERS (%d) exceeds NUM_ITERATIONS ($ d), so some buffers will not be used\n " ,
276+ Utils::Print (" [WARN] NUM_BUFFERS (%d) exceeds NUM_ITERATIONS (% d), so some buffers will not be used\n " ,
276277 numBuffers, numIterations);
277278 numBuffers = numIterations;
278279 }
@@ -307,6 +308,7 @@ int HbmBandwidthPreset(EnvVars& ev,
307308
308309 if (unrolls.empty ()) {
309310 Utils::Print (" [ERROR] UNROLLS may not be empty" );
311+ return 1 ;
310312 }
311313 for (auto unroll : unrolls) {
312314 if (unroll != 1 && unroll != 2 && unroll != 4 && unroll != 8 && unroll != 16 ) {
@@ -383,33 +385,29 @@ int HbmBandwidthPreset(EnvVars& ev,
383385 for (int gpuIdx : gpuIndices) {
384386 HIP_CALL (hipSetDevice (gpuIdx));
385387
386- // Determine wall clock rate for this GPU
387- int wallClockRate;
388- if (useWallClock) {
389- #if defined(__NVCC__)
390- wallClockRate = 1000000 ;
391- #else
392- HIP_CALL (hipDeviceGetAttribute (&wallClockRate, hipDeviceAttributeWallClockRate, gpuIdx));
393- #endif
394- }
395-
396388 // Create streams/events for this GPU
397389 hipStream_t stream;
398390 hipEvent_t startEvent, stopEvent;
399391 HIP_CALL (hipStreamCreate (&stream));
400392 HIP_CALL (hipEventCreate (&startEvent));
401393 HIP_CALL (hipEventCreate (&stopEvent));
402394
403- // Allocate pinned host memory closest to this GPU to capture timestamps
404- long long * minStartCycle;
405- long long * maxStopCycle;
406- if (Utils::AllocateMemory ({MEM_CPU_CLOSEST, gpuIdx, myRank}, sizeof (int64_t ), (void **)&minStartCycle)) {
407- Utils::Print (" [ERROR] Unable to allocate pinned host memory on rank %d closest to GPU %d\n " , myRank, gpuIdx);
408- return 1 ;
409- }
410- if (Utils::AllocateMemory ({MEM_CPU_CLOSEST, gpuIdx, myRank}, sizeof (int64_t ), (void **)&maxStopCycle)) {
411- Utils::Print (" [ERROR] Unable to allocate pinned host memory on rank %d closest to GPU %d\n " , myRank, gpuIdx);
412- return 1 ;
395+ // Allocate pinned host memory closest to this GPU to capture timestamps (if enabled)
396+ int wallClockRate;
397+ long long * minStartCycle = nullptr ;
398+ long long * maxStopCycle = nullptr ;
399+
400+ if (useWallClock) {
401+ #if defined(__NVCC__)
402+ wallClockRate = 1000000 ;
403+ #else
404+ HIP_CALL (hipDeviceGetAttribute (&wallClockRate, hipDeviceAttributeWallClockRate, gpuIdx));
405+ #endif
406+ if (Utils::AllocateMemory ({MEM_CPU_CLOSEST, gpuIdx, myRank}, sizeof (int64_t ), (void **)&minStartCycle) ||
407+ Utils::AllocateMemory ({MEM_CPU_CLOSEST, gpuIdx, myRank}, sizeof (int64_t ), (void **)&maxStopCycle)) {
408+ Utils::Print (" [ERROR] Unable to allocate pinned host memory on rank %d closest to GPU %d\n " , myRank, gpuIdx);
409+ return 1 ;
410+ }
413411 }
414412
415413 // Allocate and initialize each GPU buffer
@@ -458,7 +456,7 @@ int HbmBandwidthPreset(EnvVars& ev,
458456 int currBufferIdx = 0 ;
459457 auto prewarmEnd = std::chrono::steady_clock::now () + std::chrono::milliseconds (prewarmMsec);
460458 do {
461- kernel<<<gridDim, blockDim, 0 , stream>>>(inputBuffers[currBufferIdx++], NULL , numSteps, minStartCycle, maxStopCycle);
459+ kernel<<<gridDim, blockDim, 0 , stream>>>(inputBuffers[currBufferIdx++], nullptr , numSteps, minStartCycle, maxStopCycle);
462460 HIP_CALL (hipStreamSynchronize (stream));
463461 if (currBufferIdx == numBuffers) currBufferIdx = 0 ;
464462 } while (std::chrono::steady_clock::now () < prewarmEnd);
@@ -473,15 +471,13 @@ int HbmBandwidthPreset(EnvVars& ev,
473471 if (!useWallClock) {
474472 HIP_CALL (hipEventRecord (startEvent, stream));
475473 }
476- kernel<<<gridDim, blockDim, 0 , stream>>>(inputBuffers[currBufferIdx++], NULL , numSteps,
477- useWallClock ? minStartCycle : NULL , maxStopCycle);
474+ kernel<<<gridDim, blockDim, 0 , stream>>>(inputBuffers[currBufferIdx++], nullptr , numSteps, minStartCycle, maxStopCycle);
478475 if (!useWallClock) {
479476 HIP_CALL (hipEventRecord (stopEvent, stream));
480477 }
481478#else
482- hipExtLaunchKernelGGL (kernel, gridDim, blockDim, 0 , stream, useWallClock ? NULL : startEvent, useWallClock ? NULL : stopEvent, 0 ,
483- inputBuffers[currBufferIdx], nullptr , numSteps,
484- useWallClock ? minStartCycle : NULL , maxStopCycle);
479+ hipExtLaunchKernelGGL (kernel, gridDim, blockDim, 0 , stream, useWallClock ? nullptr : startEvent, useWallClock ? nullptr : stopEvent, 0 ,
480+ inputBuffers[currBufferIdx++], nullptr , numSteps, minStartCycle, maxStopCycle);
485481#endif
486482 HIP_CALL (hipStreamSynchronize (stream));
487483 if (currBufferIdx == numBuffers) currBufferIdx = 0 ;
@@ -543,6 +539,14 @@ int HbmBandwidthPreset(EnvVars& ev,
543539 }
544540 }
545541
542+ if (useWallClock) {
543+ if (Utils::DeallocateMemory (MEM_CPU_CLOSEST, minStartCycle, sizeof (int64_t )) ||
544+ Utils::DeallocateMemory (MEM_CPU_CLOSEST, maxStopCycle, sizeof (int64_t ))) {
545+ Utils::Print (" [ERROR] Unable to deallocate pinned host memory on rank %d closest to GPU %d\n " , myRank, gpuIdx);
546+ return 1 ;
547+ }
548+ }
549+
546550 // Cleanup streams and events
547551 HIP_CALL (hipStreamDestroy (stream));
548552 HIP_CALL (hipEventDestroy (startEvent));
0 commit comments