Skip to content
Open
Show file tree
Hide file tree
Changes from 35 commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
741ee98
add nvbench kernel launch
WenqingLan1 Jul 22, 2025
0ae7864
submodule update
WenqingLan1 Jul 22, 2025
35bfb61
init sleep kernel
WenqingLan1 Jul 30, 2025
66b4786
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Aug 25, 2025
82aed0c
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Sep 22, 2025
24ee0a5
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Oct 8, 2025
bd87f50
test sleep kernel
WenqingLan1 Oct 8, 2025
a663db6
add sm 103
WenqingLan1 Oct 8, 2025
32fe197
add arg parsing logic
WenqingLan1 Oct 8, 2025
76562dc
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Oct 8, 2025
3eb5525
add arg parsing tests
WenqingLan1 Oct 9, 2025
4785fe6
refactor
WenqingLan1 Oct 9, 2025
1fb7c05
refine logic - remove gpu_id
WenqingLan1 Oct 9, 2025
83c442c
add doc
WenqingLan1 Oct 9, 2025
4b274c4
refine regex & update nvbench submodule
WenqingLan1 Oct 9, 2025
0cf48bb
update cmake
WenqingLan1 Oct 10, 2025
5905647
fix lint
WenqingLan1 Oct 10, 2025
baa57c9
fix lint
WenqingLan1 Oct 10, 2025
ecce2d9
fix import
WenqingLan1 Oct 10, 2025
3a58ead
fix
WenqingLan1 Oct 10, 2025
d0d8773
fix
WenqingLan1 Oct 10, 2025
fbb5969
fix
WenqingLan1 Oct 10, 2025
f007745
fix
WenqingLan1 Oct 10, 2025
b6b6082
fix
WenqingLan1 Oct 10, 2025
0f2c838
fix
WenqingLan1 Oct 10, 2025
5bd20f6
fix
WenqingLan1 Oct 10, 2025
ab88d25
fix pipeline
WenqingLan1 Oct 10, 2025
3faaf60
fix cmake
WenqingLan1 Oct 13, 2025
896a46a
fix pipeline
WenqingLan1 Oct 14, 2025
5d4986b
fix pipeline
WenqingLan1 Oct 14, 2025
b246522
fix pipeline & mlc version
WenqingLan1 Oct 14, 2025
ffe182e
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Dec 17, 2025
2877feb
Merge branch 'main' into feat/third_party/nvbench
WenqingLan1 Dec 22, 2025
0902eef
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Feb 3, 2026
498d551
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Feb 6, 2026
0804c12
fix comments
WenqingLan1 Feb 6, 2026
c1d1e43
add auto throughput benchmark
WenqingLan1 Feb 18, 2026
c34591d
refined logic & fix bug
WenqingLan1 Feb 20, 2026
68f5c7d
add comment to clarify diff between nvbench-kernel-launch and kernel-…
WenqingLan1 Feb 26, 2026
0bde332
resolve comments
WenqingLan1 Mar 10, 2026
7c456cf
fix lint
WenqingLan1 Mar 10, 2026
9643150
fix pipeline & resolve comments
WenqingLan1 Mar 10, 2026
f1a3b6d
fix lint
WenqingLan1 Mar 10, 2026
fe48e35
fix test
WenqingLan1 Mar 10, 2026
e1e12d2
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Apr 8, 2026
6fc5afb
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Apr 22, 2026
e253b85
resolve comments
WenqingLan1 Apr 22, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .github/workflows/codeql-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,11 @@ jobs:
- name: Install Dependency
run: |
DEBIAN_FRONTEND=noninteractive apt-get update
DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev sudo
DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev sudo build-essential
- name: Setup CMake
uses: lukka/get-cmake@latest
Copy link

Copilot AI Jan 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using @latest for third-party GitHub Actions is a supply-chain risk and can lead to non-reproducible CI behavior. Pin this action to a specific tagged version or commit SHA.

Suggested change
uses: lukka/get-cmake@latest
uses: lukka/get-cmake@v3.20.0

Copilot uses AI. Check for mistakes.
with:
cmakeVersion: '3.20.0'
- name: Initialize CodeQL
uses: github/codeql-action/init@v3
with:
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,9 @@ cython_debug/
*.userosscache
*.sln.docstates

# Build temporary files
compile_commands.json

# Build results
[Dd]ebug/
[Dd]ebugPublic/
Expand Down
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,6 @@
[submodule "third_party/nvbandwidth"]
path = third_party/nvbandwidth
url = https://github.com/NVIDIA/nvbandwidth.git
[submodule "third_party/nvbench"]
path = third_party/nvbench
Comment thread
WenqingLan1 marked this conversation as resolved.
url = https://github.com/NVIDIA/nvbench.git
23 changes: 22 additions & 1 deletion dockerfile/cuda12.8.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,27 @@ RUN apt-get update && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*

# Install CMake 3.30.4 for nvbench compatibility
RUN apt-get update && \
apt-get remove -y cmake cmake-data && \
apt-get autoremove -y && \
cd /tmp && \
ARCH=$(uname -m) && \
case ${ARCH} in \
"aarch64") CMAKE_ARCH="aarch64" ;; \
"x86_64") CMAKE_ARCH="x86_64" ;; \
"arm64") CMAKE_ARCH="aarch64" ;; \
*) CMAKE_ARCH="x86_64" ;; \
esac && \
echo "Detected architecture: ${ARCH}, using CMAKE_ARCH: ${CMAKE_ARCH}" && \
wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \
ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
rm -rf cmake-3.30.4-linux-${CMAKE_ARCH}* && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

ARG NUM_MAKE_JOBS=
ARG TARGETPLATFORM
ARG TARGETARCH
Expand Down Expand Up @@ -161,7 +182,7 @@ ADD dockerfile/etc /opt/microsoft/
WORKDIR ${SB_HOME}

ADD third_party third_party
RUN make -C third_party cuda_with_msccl
RUN make -C third_party cuda_with_msccl cuda_nvbench

ADD . .
RUN python3 -m pip install --upgrade setuptools==70.3.0 && \
Expand Down
24 changes: 23 additions & 1 deletion dockerfile/cuda12.9.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,28 @@ RUN apt-get update && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*

# Install CMake 3.30.4 for nvbench compatibility
RUN apt-get update && \
apt-get remove -y cmake cmake-data && \
apt-get autoremove -y && \
cd /tmp && \
ARCH=$(uname -m) && \
case ${ARCH} in \
"aarch64") CMAKE_ARCH="aarch64" ;; \
"x86_64") CMAKE_ARCH="x86_64" ;; \
"arm64") CMAKE_ARCH="aarch64" ;; \
*) CMAKE_ARCH="x86_64" ;; \
esac && \
echo "Detected architecture: ${ARCH}, using CMAKE_ARCH: ${CMAKE_ARCH}" && \
wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \
ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
rm -rf cmake-3.30.4-linux-${CMAKE_ARCH}* && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
Comment thread
WenqingLan1 marked this conversation as resolved.


Comment thread
WenqingLan1 marked this conversation as resolved.
ARG NUM_MAKE_JOBS=
ARG TARGETPLATFORM
ARG TARGETARCH
Expand Down Expand Up @@ -162,7 +184,7 @@ ADD dockerfile/etc /opt/microsoft/
WORKDIR ${SB_HOME}

ADD third_party third_party
RUN make -C third_party cuda_with_msccl
RUN make -C third_party cuda_with_msccl cuda_nvbench

ADD . .
RUN python3 -m pip install --upgrade setuptools==78.1.0 && \
Expand Down
23 changes: 22 additions & 1 deletion dockerfile/cuda13.0.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,27 @@ RUN apt-get update && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*

# Install CMake 3.30.4 for nvbench compatibility
Comment thread
WenqingLan1 marked this conversation as resolved.
Outdated
RUN apt-get update && \
apt-get remove -y cmake cmake-data && \
apt-get autoremove -y && \
cd /tmp && \
ARCH=$(uname -m) && \
case ${ARCH} in \
"aarch64") CMAKE_ARCH="aarch64" ;; \
"x86_64") CMAKE_ARCH="x86_64" ;; \
"arm64") CMAKE_ARCH="aarch64" ;; \
*) CMAKE_ARCH="x86_64" ;; \
esac && \
echo "Detected architecture: ${ARCH}, using CMAKE_ARCH: ${CMAKE_ARCH}" && \
wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \
ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
rm -rf cmake-3.30.4-linux-${CMAKE_ARCH}* && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

ARG NUM_MAKE_JOBS=
ARG TARGETPLATFORM
ARG TARGETARCH
Expand Down Expand Up @@ -151,7 +172,7 @@ ADD dockerfile/etc /opt/microsoft/
WORKDIR ${SB_HOME}

Comment thread
WenqingLan1 marked this conversation as resolved.
ADD third_party third_party
RUN make -C third_party cuda
RUN make -C third_party cuda cuda_nvbench

ADD . .
RUN python3 -m pip install --upgrade setuptools==78.1.0 && \
Expand Down
2 changes: 1 addition & 1 deletion dockerfile/rocm5.0.x.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ RUN cd /tmp && \

# Install Intel MLC
RUN cd /tmp && \
wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
Comment thread
WenqingLan1 marked this conversation as resolved.
tar xzf mlc.tgz Linux/mlc && \
cp ./Linux/mlc /usr/local/bin/ && \
rm -rf ./Linux mlc.tgz
Expand Down
45 changes: 45 additions & 0 deletions docs/user-tutorial/benchmarks/micro-benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,51 @@ Supports the use of double unit types and the use of tensor cores.
| gpu-burn/gpu_[0-9]_pass | yes/no | The result of the gpu-burn test for each GPU (1: yes, 0: no). |
| gpu-burn/abort | yes/no | Whether or not GPU-burn test aborted before returning GPU results (1: yes, 0: no). |

### `nvbench-sleep-kernel`

#### Introduction

Measure GPU kernel execution time using NVBench's sleep kernel benchmark. This benchmark creates CUDA kernels that sleep for specified durations (in microseconds) and measures the actual execution time, providing insights into GPU scheduling overhead and timing accuracy.

The benchmark supports multiple duration specification formats:
- Single value: `"50"` - Test single duration of 50μs
- List format: `"[25,50,75]"` - Test multiple specific durations
- Range format: `"[25:75]"` - Test all values from 25μs to 75μs
- Range with step: `"[0:50:10]"` - Test from 0μs to 50μs in steps of 10μs

Performed by [NVBench](https://github.com/NVIDIA/nvbench) sleep kernel benchmark.

#### Metrics

| Name | Unit | Description |
|-----------------------------------------|-----------|-------------------------------------------------------|
| nvbench-sleep-kernel/duration_us_{X}_cpu_time | time (μs) | CPU-measured time for duration X microseconds. |
| nvbench-sleep-kernel/duration_us_{X}_gpu_time | time (μs) | GPU-measured time for duration X microseconds. |
| nvbench-sleep-kernel/duration_us_{X}_batch_gpu_time | time (μs) | GPU batch execution time for duration X microseconds. |

Where `{X}` is the sleep duration in microseconds (e.g., 25, 50, 75).

### `nvbench-kernel-launch`

#### Introduction

Measure GPU kernel launch overhead and execution time using NVBench's kernel launch benchmark. This benchmark evaluates the time required to launch kernels on the GPU and measures both CPU-side and GPU-side timing for kernel execution.

The benchmark provides insights into:
- Kernel launch latency
- CPU/GPU synchronization overhead
- Batch execution performance

Performed by [NVBench](https://github.com/NVIDIA/nvbench) kernel launch benchmark.

#### Metrics

| Name | Unit | Description |
|-------------------------------------|-----------|------------------------------------------------|
| nvbench-kernel-launch/cpu_time | time (μs) | CPU-measured kernel execution time. |
| nvbench-kernel-launch/gpu_time | time (μs) | GPU-measured kernel execution time. |
| nvbench-kernel-launch/batch_gpu_time | time (μs) | GPU batch execution time. |

### `cpu-hpl`

#### Introduction
Expand Down
30 changes: 30 additions & 0 deletions examples/benchmarks/nvbench_kernel_launch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Example of NVBench Kernel Launch benchmark."""

from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.common.utils import logger

if __name__ == '__main__':
    # Build a CUDA benchmark context; the parameter string is forwarded to the
    # nvbench-kernel-launch micro-benchmark's argument parser.
    context = BenchmarkRegistry.create_benchmark_context(
        'nvbench-kernel-launch',
        platform=Platform.CUDA,
        parameters=(
            '--timeout 30 '
            '--min-samples 10 '
            '--min-time 1.0 '
            '--max-noise 0.1 '
            '--stopping-criterion stdrel '
            '--throttle-threshold 80 '
            '--throttle-recovery-delay 1.0'
        )
    )

    benchmark = BenchmarkRegistry.launch_benchmark(context)
    if benchmark:
        logger.info(
            'benchmark: {}, return code: {}, result: {}'.format(
                benchmark.name, benchmark.return_code, benchmark.result
            )
        )
    else:
        # Report launch failure explicitly, matching the sibling
        # nvbench_sleep_kernel.py example instead of failing silently.
        logger.error('benchmark: nvbench-kernel-launch launch failed.')
Comment thread
WenqingLan1 marked this conversation as resolved.
28 changes: 28 additions & 0 deletions examples/benchmarks/nvbench_sleep_kernel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Example of NVBench Sleep Kernel benchmark."""

from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.common.utils import logger


def main():
"""Main method to run the nvbench sleep kernel benchmark."""
context = BenchmarkRegistry.create_benchmark_context(
'nvbench-sleep-kernel', platform=Platform.CUDA, parameters='--duration_us "[25,50,75]" --timeout 10'
)

benchmark = BenchmarkRegistry.launch_benchmark(context)
if benchmark:
logger.info(
'benchmark: {}, return code: {}, result: {}'.format(
benchmark.name, benchmark.return_code, benchmark.result
)
)
else:
logger.error('benchmark: nvbench-sleep-kernel launch failed.')


if __name__ == '__main__':
main()
46 changes: 10 additions & 36 deletions superbench/benchmarks/micro_benchmarks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,42 +39,16 @@
from superbench.benchmarks.micro_benchmarks.directx_mem_bw_performance import DirectXGPUMemBw
from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops
from superbench.benchmarks.micro_benchmarks.nvbandwidth import NvBandwidthBenchmark
from superbench.benchmarks.micro_benchmarks.nvbench_kernel_launch import NvbenchKernelLaunch
from superbench.benchmarks.micro_benchmarks.nvbench_sleep_kernel import NvbenchSleepKernel

__all__ = [
'BlasLtBaseBenchmark',
'ComputationCommunicationOverlap',
'CpuMemBwLatencyBenchmark',
'CpuHplBenchmark',
'CpuStreamBenchmark',
'CublasBenchmark',
'CublasLtBenchmark',
'CudaGemmFlopsBenchmark',
'CudaMemBwBenchmark',
'CudaNcclBwBenchmark',
'CudnnBenchmark',
'DiskBenchmark',
'DistInference',
'HipBlasLtBenchmark',
'GPCNetBenchmark',
'GemmFlopsBenchmark',
'GpuBurnBenchmark',
'GpuCopyBwBenchmark',
'GpuStreamBenchmark',
'IBBenchmark',
'IBLoopbackBenchmark',
'KernelLaunch',
'MemBwBenchmark',
'MicroBenchmark',
'MicroBenchmarkWithInvoke',
'ORTInferenceBenchmark',
'RocmGemmFlopsBenchmark',
'RocmMemBwBenchmark',
'ShardingMatmul',
'TCPConnectivityBenchmark',
'TensorRTInferenceBenchmark',
'DirectXGPUEncodingLatency',
'DirectXGPUCopyBw',
'DirectXGPUMemBw',
'DirectXGPUCoreFlops',
'NvBandwidthBenchmark',
Comment thread
WenqingLan1 marked this conversation as resolved.
'BlasLtBaseBenchmark', 'ComputationCommunicationOverlap', 'CpuMemBwLatencyBenchmark', 'CpuHplBenchmark',
'CpuStreamBenchmark', 'CublasBenchmark', 'CublasLtBenchmark', 'CudaGemmFlopsBenchmark', 'CudaMemBwBenchmark',
'CudaNcclBwBenchmark', 'CudnnBenchmark', 'DiskBenchmark', 'DistInference', 'HipBlasLtBenchmark', 'GPCNetBenchmark',
'GemmFlopsBenchmark', 'GpuBurnBenchmark', 'GpuCopyBwBenchmark', 'GpuStreamBenchmark', 'IBBenchmark',
'IBLoopbackBenchmark', 'KernelLaunch', 'MemBwBenchmark', 'MicroBenchmark', 'MicroBenchmarkWithInvoke',
'ORTInferenceBenchmark', 'RocmGemmFlopsBenchmark', 'RocmMemBwBenchmark', 'ShardingMatmul',
'TCPConnectivityBenchmark', 'TensorRTInferenceBenchmark', 'DirectXGPUEncodingLatency', 'DirectXGPUCopyBw',
'DirectXGPUMemBw', 'DirectXGPUCoreFlops', 'NvBandwidthBenchmark', 'NvbenchKernelLaunch', 'NvbenchSleepKernel'
Comment thread
WenqingLan1 marked this conversation as resolved.
Outdated
]
45 changes: 45 additions & 0 deletions superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
cmake_minimum_required(VERSION 3.18)

# nvbench requires CMake >= 3.30.4. Check BEFORE declaring the project so we
# bail out cleanly on older CMake instead of failing inside project(), which
# would otherwise try to enable the CUDA language first.
if(CMAKE_VERSION VERSION_LESS "3.30.4")
    message(STATUS "CMake version ${CMAKE_VERSION} is less than 3.30.4 (required by nvbench), skipping nvbench benchmarks")
    return()
endif()

project(nvbench_benchmarks LANGUAGES CUDA)
Comment on lines +1 to +8
Copy link

Copilot AI Apr 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This CMakeLists declares project(... LANGUAGES CUDA) before checking the CMake version / CUDA availability. If this directory is configured on a machine without a CUDA toolchain (or when CMake < 3.30.4), configuration can fail before reaching the intended “skip” logic. Consider moving the CMake version guard above project() and using project(... LANGUAGES CXX) + include(cuda_common.cmake)/enable_language(CUDA) only inside the CUDAToolkit_FOUND branch.

Copilot uses AI. Check for mistakes.

find_package(CUDAToolkit QUIET)
if (CUDAToolkit_FOUND)
include(../cuda_common.cmake)

# Try to find nvbench, but don't require it
find_package(nvbench CONFIG QUIET)

if (nvbench_FOUND)
Comment thread
WenqingLan1 marked this conversation as resolved.
message(STATUS "Found nvbench, building nvbench benchmarks")

# list all your CUDA benchmark source files here
set(NVBENCH_SOURCES
kernel_launch.cu
sleep_kernel.cu
# add more *.cu as needed
)

foreach(src ${NVBENCH_SOURCES})
# strip ".cu" → NAME_WE
get_filename_component(basename ${src} NAME_WE)
set(target nvbench_${basename})

add_executable(${target} ${src})
target_compile_features(${target} PUBLIC cuda_std_17)
target_link_libraries(${target}
PRIVATE nvbench::nvbench nvbench::main
)
install(TARGETS ${target} RUNTIME DESTINATION bin)
Comment thread
WenqingLan1 marked this conversation as resolved.
endforeach()
else()
message(STATUS "nvbench not found, skipping nvbench benchmarks.")
message(STATUS "To build nvbench benchmarks, first build the submodule in third_party/nvbench")
endif()
else()
message(STATUS "CUDA not found, skipping nvbench benchmarks.")
endif()
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#include <nvbench/nvbench.cuh>

__global__ void empty_kernel() {}
Comment thread
WenqingLan1 marked this conversation as resolved.

// NVBench entry point measuring the cost of launching a do-nothing kernel:
// each timed iteration launches empty_kernel with a single thread on the
// nvbench-managed stream, so the measurement is dominated by launch overhead
// rather than kernel work.
void kernel_launch(nvbench::state &state) {
state.exec([](nvbench::launch &launch) { empty_kernel<<<1, 1, 0, launch.get_stream()>>>(); });
}

// Register the benchmark with nvbench under the name "kernel_launch".
NVBENCH_BENCH(kernel_launch);
22 changes: 22 additions & 0 deletions superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#include <cuda/std/chrono>
#include <cuda_runtime.h>
#include <nvbench/nvbench.cuh>
Comment thread
WenqingLan1 marked this conversation as resolved.

// Device-side busy-wait: a single thread spins on the device
// high-resolution clock until the requested number of microseconds has
// elapsed. No memory is touched; the kernel exists purely to occupy the GPU
// for a controllable duration.
__global__ void sleep_kernel(nvbench::int64_t microseconds) {
    const auto deadline =
        cuda::std::chrono::high_resolution_clock::now() + cuda::std::chrono::microseconds(microseconds);

    // Poll the clock until the deadline passes.
    for (;;) {
        if (cuda::std::chrono::high_resolution_clock::now() >= deadline) {
            break;
        }
    }
}

// NVBench entry point: read the requested sleep duration from the
// "Duration (us)" axis and time a single-thread sleep_kernel launch on the
// nvbench-managed stream.
void sleep_benchmark(nvbench::state &state) {
    const auto requested_us = state.get_int64("Duration (us)");
    state.exec([requested_us](nvbench::launch &launch) {
        sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(requested_us);
    });
}
// Sweep durations from 0 us to 100 us in 5 us steps.
NVBENCH_BENCH(sleep_benchmark)
    .add_int64_axis("Duration (us)", nvbench::range(0, 100, 5))
    .set_timeout(1); // Limit to one second per measurement.
Comment thread
WenqingLan1 marked this conversation as resolved.
Loading
Loading