Skip to content
Open
Show file tree
Hide file tree
Changes from 35 commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
741ee98
add nvbench kernel launch
WenqingLan1 Jul 22, 2025
0ae7864
submodule update
WenqingLan1 Jul 22, 2025
35bfb61
init sleep kernel
WenqingLan1 Jul 30, 2025
66b4786
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Aug 25, 2025
82aed0c
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Sep 22, 2025
24ee0a5
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Oct 8, 2025
bd87f50
test sleep kernel
WenqingLan1 Oct 8, 2025
a663db6
add sm 103
WenqingLan1 Oct 8, 2025
32fe197
add arg parsing logic
WenqingLan1 Oct 8, 2025
76562dc
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Oct 8, 2025
3eb5525
add arg parsing tests
WenqingLan1 Oct 9, 2025
4785fe6
refactor
WenqingLan1 Oct 9, 2025
1fb7c05
refine logic - remove gpu_id
WenqingLan1 Oct 9, 2025
83c442c
add doc
WenqingLan1 Oct 9, 2025
4b274c4
refine regex & update nvbench submodule
WenqingLan1 Oct 9, 2025
0cf48bb
update cmake
WenqingLan1 Oct 10, 2025
5905647
fix lint
WenqingLan1 Oct 10, 2025
baa57c9
fix lint
WenqingLan1 Oct 10, 2025
ecce2d9
fix import
WenqingLan1 Oct 10, 2025
3a58ead
fix
WenqingLan1 Oct 10, 2025
d0d8773
fix
WenqingLan1 Oct 10, 2025
fbb5969
fix
WenqingLan1 Oct 10, 2025
f007745
fix
WenqingLan1 Oct 10, 2025
b6b6082
fix
WenqingLan1 Oct 10, 2025
0f2c838
fix
WenqingLan1 Oct 10, 2025
5bd20f6
fix
WenqingLan1 Oct 10, 2025
ab88d25
fix pipeline
WenqingLan1 Oct 10, 2025
3faaf60
fix cmake
WenqingLan1 Oct 13, 2025
896a46a
fix pipeline
WenqingLan1 Oct 14, 2025
5d4986b
fix pipeline
WenqingLan1 Oct 14, 2025
b246522
fix pipeline & mlc version
WenqingLan1 Oct 14, 2025
ffe182e
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Dec 17, 2025
2877feb
Merge branch 'main' into feat/third_party/nvbench
WenqingLan1 Dec 22, 2025
0902eef
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Feb 3, 2026
498d551
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Feb 6, 2026
0804c12
fix comments
WenqingLan1 Feb 6, 2026
c1d1e43
add auto throughput benchmark
WenqingLan1 Feb 18, 2026
c34591d
refined logic & fix bug
WenqingLan1 Feb 20, 2026
68f5c7d
add comment to clarify diff between nvbench-kernel-launch and kernel-…
WenqingLan1 Feb 26, 2026
0bde332
resolve comments
WenqingLan1 Mar 10, 2026
7c456cf
fix lint
WenqingLan1 Mar 10, 2026
9643150
fix pipeline & resolve comments
WenqingLan1 Mar 10, 2026
f1a3b6d
fix lint
WenqingLan1 Mar 10, 2026
fe48e35
fix test
WenqingLan1 Mar 10, 2026
e1e12d2
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Apr 8, 2026
6fc5afb
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Apr 22, 2026
e253b85
resolve comments
WenqingLan1 Apr 22, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .github/workflows/codeql-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,11 @@ jobs:
- name: Install Dependency
run: |
DEBIAN_FRONTEND=noninteractive apt-get update
DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev sudo
DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev sudo build-essential
- name: Setup CMake
uses: lukka/get-cmake@latest
Copy link

Copilot AI Jan 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using @latest for third-party GitHub Actions is a supply-chain risk and can lead to non-reproducible CI behavior. Pin this action to a specific tagged version or commit SHA.

Suggested change
uses: lukka/get-cmake@latest
uses: lukka/get-cmake@v3.20.0

Copilot uses AI. Check for mistakes.
with:
cmakeVersion: '3.20.0'
- name: Initialize CodeQL
uses: github/codeql-action/init@v3
with:
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,9 @@ cython_debug/
*.userosscache
*.sln.docstates

# Build temporary files
compile_commands.json

# Build results
[Dd]ebug/
[Dd]ebugPublic/
Expand Down
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,6 @@
[submodule "third_party/nvbandwidth"]
path = third_party/nvbandwidth
url = https://github.com/NVIDIA/nvbandwidth.git
[submodule "third_party/nvbench"]
path = third_party/nvbench
Comment thread
WenqingLan1 marked this conversation as resolved.
url = https://github.com/NVIDIA/nvbench.git
23 changes: 22 additions & 1 deletion dockerfile/cuda12.8.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,27 @@ RUN apt-get update && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*

# Install CMake 3.30.4 for nvbench compatibility
RUN apt-get update && \
apt-get remove -y cmake cmake-data && \
apt-get autoremove -y && \
cd /tmp && \
ARCH=$(uname -m) && \
case ${ARCH} in \
"aarch64") CMAKE_ARCH="aarch64" ;; \
"x86_64") CMAKE_ARCH="x86_64" ;; \
"arm64") CMAKE_ARCH="aarch64" ;; \
*) CMAKE_ARCH="x86_64" ;; \
esac && \
echo "Detected architecture: ${ARCH}, using CMAKE_ARCH: ${CMAKE_ARCH}" && \
wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \
ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
rm -rf cmake-3.30.4-linux-${CMAKE_ARCH}* && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

ARG NUM_MAKE_JOBS=
ARG TARGETPLATFORM
ARG TARGETARCH
Expand Down Expand Up @@ -161,7 +182,7 @@ ADD dockerfile/etc /opt/microsoft/
WORKDIR ${SB_HOME}

ADD third_party third_party
RUN make -C third_party cuda_with_msccl
RUN make -C third_party cuda_with_msccl cuda_nvbench

ADD . .
RUN python3 -m pip install --upgrade setuptools==70.3.0 && \
Expand Down
24 changes: 23 additions & 1 deletion dockerfile/cuda12.9.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,28 @@ RUN apt-get update && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*

# Install CMake 3.30.4 for nvbench compatibility
RUN apt-get update && \
apt-get remove -y cmake cmake-data && \
apt-get autoremove -y && \
cd /tmp && \
ARCH=$(uname -m) && \
case ${ARCH} in \
"aarch64") CMAKE_ARCH="aarch64" ;; \
"x86_64") CMAKE_ARCH="x86_64" ;; \
"arm64") CMAKE_ARCH="aarch64" ;; \
*) CMAKE_ARCH="x86_64" ;; \
esac && \
echo "Detected architecture: ${ARCH}, using CMAKE_ARCH: ${CMAKE_ARCH}" && \
wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \
ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
rm -rf cmake-3.30.4-linux-${CMAKE_ARCH}* && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
Comment thread
WenqingLan1 marked this conversation as resolved.


Comment thread
WenqingLan1 marked this conversation as resolved.
ARG NUM_MAKE_JOBS=
ARG TARGETPLATFORM
ARG TARGETARCH
Expand Down Expand Up @@ -162,7 +184,7 @@ ADD dockerfile/etc /opt/microsoft/
WORKDIR ${SB_HOME}

ADD third_party third_party
RUN make -C third_party cuda_with_msccl
RUN make -C third_party cuda_with_msccl cuda_nvbench

ADD . .
RUN python3 -m pip install --upgrade setuptools==78.1.0 && \
Expand Down
23 changes: 22 additions & 1 deletion dockerfile/cuda13.0.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,27 @@ RUN apt-get update && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*

# Install CMake 3.30.4 for nvbench compatibility
Comment thread
WenqingLan1 marked this conversation as resolved.
Outdated
RUN apt-get update && \
apt-get remove -y cmake cmake-data && \
apt-get autoremove -y && \
cd /tmp && \
ARCH=$(uname -m) && \
case ${ARCH} in \
"aarch64") CMAKE_ARCH="aarch64" ;; \
"x86_64") CMAKE_ARCH="x86_64" ;; \
"arm64") CMAKE_ARCH="aarch64" ;; \
*) CMAKE_ARCH="x86_64" ;; \
esac && \
echo "Detected architecture: ${ARCH}, using CMAKE_ARCH: ${CMAKE_ARCH}" && \
wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \
ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
rm -rf cmake-3.30.4-linux-${CMAKE_ARCH}* && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

ARG NUM_MAKE_JOBS=
ARG TARGETPLATFORM
ARG TARGETARCH
Expand Down Expand Up @@ -151,7 +172,7 @@ ADD dockerfile/etc /opt/microsoft/
WORKDIR ${SB_HOME}

Comment thread
WenqingLan1 marked this conversation as resolved.
ADD third_party third_party
RUN make -C third_party cuda
RUN make -C third_party cuda cuda_nvbench

ADD . .
RUN python3 -m pip install --upgrade setuptools==78.1.0 && \
Expand Down
2 changes: 1 addition & 1 deletion dockerfile/rocm5.0.x.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ RUN cd /tmp && \

# Install Intel MLC
RUN cd /tmp && \
wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
Comment thread
WenqingLan1 marked this conversation as resolved.
tar xzf mlc.tgz Linux/mlc && \
cp ./Linux/mlc /usr/local/bin/ && \
rm -rf ./Linux mlc.tgz
Expand Down
45 changes: 45 additions & 0 deletions docs/user-tutorial/benchmarks/micro-benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,51 @@ Supports the use of double unit types and the use of tensor cores.
| gpu-burn/gpu_[0-9]_pass | yes/no | The result of the gpu-burn test for each GPU (1: yes, 0: no). |
| gpu-burn/abort | yes/no | Whether or not GPU-burn test aborted before returning GPU results (1: yes, 0: no). |

### `nvbench-sleep-kernel`

#### Introduction

Measure GPU kernel execution time using NVBench's sleep kernel benchmark. This benchmark creates CUDA kernels that sleep for specified durations (in microseconds) and measures the actual execution time, providing insights into GPU scheduling overhead and timing accuracy.

The benchmark supports multiple duration specification formats:
- Single value: `"50"` - Test single duration of 50μs
- List format: `"[25,50,75]"` - Test multiple specific durations
- Range format: `"[25:75]"` - Test all values from 25μs to 75μs
- Range with step: `"[0:50:10]"` - Test from 0μs to 50μs in steps of 10μs

Performed by [NVBench](https://github.com/NVIDIA/nvbench) sleep kernel benchmark.

#### Metrics

| Name | Unit | Description |
|-----------------------------------------|-----------|-------------------------------------------------------|
| nvbench-sleep-kernel/duration_us_{X}_cpu_time | time (μs) | CPU-measured time for duration X microseconds. |
| nvbench-sleep-kernel/duration_us_{X}_gpu_time | time (μs) | GPU-measured time for duration X microseconds. |
| nvbench-sleep-kernel/duration_us_{X}_batch_gpu_time | time (μs) | GPU batch execution time for duration X microseconds. |

Where `{X}` is the sleep duration in microseconds (e.g., 25, 50, 75).

### `nvbench-kernel-launch`

#### Introduction

Measure GPU kernel launch overhead and execution time using NVBench's kernel launch benchmark. This benchmark evaluates the time required to launch kernels on the GPU and measures both CPU-side and GPU-side timing for kernel execution.

The benchmark provides insights into:
- Kernel launch latency
- CPU/GPU synchronization overhead
- Batch execution performance

Performed by [NVBench](https://github.com/NVIDIA/nvbench) kernel launch benchmark.

#### Metrics

| Name | Unit | Description |
|-------------------------------------|-----------|------------------------------------------------|
| nvbench-kernel-launch/cpu_time | time (μs) | CPU-measured kernel execution time. |
| nvbench-kernel-launch/gpu_time | time (μs) | GPU-measured kernel execution time. |
| nvbench-kernel-launch/batch_gpu_time | time (μs) | GPU batch execution time. |

### `cpu-hpl`

#### Introduction
Expand Down
30 changes: 30 additions & 0 deletions examples/benchmarks/nvbench_kernel_launch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Example of NVBench Kernel Launch benchmark."""

from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.common.utils import logger

if __name__ == '__main__':
    # Build a CUDA benchmark context; the parameter string is forwarded to the
    # nvbench-kernel-launch micro-benchmark's argument parser.
    context = BenchmarkRegistry.create_benchmark_context(
        'nvbench-kernel-launch',
        platform=Platform.CUDA,
        parameters=(
            '--timeout 30 '
            '--min-samples 10 '
            '--min-time 1.0 '
            '--max-noise 0.1 '
            '--stopping-criterion stdrel '
            '--throttle-threshold 80 '
            '--throttle-recovery-delay 1.0'
        )
    )

    benchmark = BenchmarkRegistry.launch_benchmark(context)
    if benchmark:
        logger.info(
            'benchmark: {}, return code: {}, result: {}'.format(
                benchmark.name, benchmark.return_code, benchmark.result
            )
        )
    else:
        # Report launch failure explicitly, matching the sibling
        # nvbench_sleep_kernel.py example instead of failing silently.
        logger.error('benchmark: nvbench-kernel-launch launch failed.')
Comment thread
WenqingLan1 marked this conversation as resolved.
28 changes: 28 additions & 0 deletions examples/benchmarks/nvbench_sleep_kernel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Example of NVBench Sleep Kernel benchmark."""

from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.common.utils import logger


def main():
"""Main method to run the nvbench sleep kernel benchmark."""
context = BenchmarkRegistry.create_benchmark_context(
'nvbench-sleep-kernel', platform=Platform.CUDA, parameters='--duration_us "[25,50,75]" --timeout 10'
)

benchmark = BenchmarkRegistry.launch_benchmark(context)
if benchmark:
logger.info(
'benchmark: {}, return code: {}, result: {}'.format(
benchmark.name, benchmark.return_code, benchmark.result
)
)
else:
logger.error('benchmark: nvbench-sleep-kernel launch failed.')


if __name__ == '__main__':
main()
46 changes: 10 additions & 36 deletions superbench/benchmarks/micro_benchmarks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,42 +39,16 @@
from superbench.benchmarks.micro_benchmarks.directx_mem_bw_performance import DirectXGPUMemBw
from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops
from superbench.benchmarks.micro_benchmarks.nvbandwidth import NvBandwidthBenchmark
from superbench.benchmarks.micro_benchmarks.nvbench_kernel_launch import NvbenchKernelLaunch
from superbench.benchmarks.micro_benchmarks.nvbench_sleep_kernel import NvbenchSleepKernel

__all__ = [
'BlasLtBaseBenchmark',
'ComputationCommunicationOverlap',
'CpuMemBwLatencyBenchmark',
'CpuHplBenchmark',
'CpuStreamBenchmark',
'CublasBenchmark',
'CublasLtBenchmark',
'CudaGemmFlopsBenchmark',
'CudaMemBwBenchmark',
'CudaNcclBwBenchmark',
'CudnnBenchmark',
'DiskBenchmark',
'DistInference',
'HipBlasLtBenchmark',
'GPCNetBenchmark',
'GemmFlopsBenchmark',
'GpuBurnBenchmark',
'GpuCopyBwBenchmark',
'GpuStreamBenchmark',
'IBBenchmark',
'IBLoopbackBenchmark',
'KernelLaunch',
'MemBwBenchmark',
'MicroBenchmark',
'MicroBenchmarkWithInvoke',
'ORTInferenceBenchmark',
'RocmGemmFlopsBenchmark',
'RocmMemBwBenchmark',
'ShardingMatmul',
'TCPConnectivityBenchmark',
'TensorRTInferenceBenchmark',
'DirectXGPUEncodingLatency',
'DirectXGPUCopyBw',
'DirectXGPUMemBw',
'DirectXGPUCoreFlops',
'NvBandwidthBenchmark',
Comment thread
WenqingLan1 marked this conversation as resolved.
'BlasLtBaseBenchmark', 'ComputationCommunicationOverlap', 'CpuMemBwLatencyBenchmark', 'CpuHplBenchmark',
'CpuStreamBenchmark', 'CublasBenchmark', 'CublasLtBenchmark', 'CudaGemmFlopsBenchmark', 'CudaMemBwBenchmark',
'CudaNcclBwBenchmark', 'CudnnBenchmark', 'DiskBenchmark', 'DistInference', 'HipBlasLtBenchmark', 'GPCNetBenchmark',
'GemmFlopsBenchmark', 'GpuBurnBenchmark', 'GpuCopyBwBenchmark', 'GpuStreamBenchmark', 'IBBenchmark',
'IBLoopbackBenchmark', 'KernelLaunch', 'MemBwBenchmark', 'MicroBenchmark', 'MicroBenchmarkWithInvoke',
'ORTInferenceBenchmark', 'RocmGemmFlopsBenchmark', 'RocmMemBwBenchmark', 'ShardingMatmul',
'TCPConnectivityBenchmark', 'TensorRTInferenceBenchmark', 'DirectXGPUEncodingLatency', 'DirectXGPUCopyBw',
'DirectXGPUMemBw', 'DirectXGPUCoreFlops', 'NvBandwidthBenchmark', 'NvbenchKernelLaunch', 'NvbenchSleepKernel'
Comment thread
WenqingLan1 marked this conversation as resolved.
Outdated
]
45 changes: 45 additions & 0 deletions superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
cmake_minimum_required(VERSION 3.18)

# nvbench requires CMake >= 3.30.4. Check BEFORE declaring the project so we
# bail out cleanly on older CMake instead of failing inside project(), which
# would otherwise try to enable the CUDA language first.
if(CMAKE_VERSION VERSION_LESS "3.30.4")
    message(STATUS "CMake version ${CMAKE_VERSION} is less than 3.30.4 (required by nvbench), skipping nvbench benchmarks")
    return()
endif()

project(nvbench_benchmarks LANGUAGES CUDA)
Comment on lines +1 to +8
Copy link

Copilot AI Apr 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This CMakeLists declares project(... LANGUAGES CUDA) before checking the CMake version / CUDA availability. If this directory is configured on a machine without a CUDA toolchain (or when CMake < 3.30.4), configuration can fail before reaching the intended “skip” logic. Consider moving the CMake version guard above project() and using project(... LANGUAGES CXX) + include(cuda_common.cmake)/enable_language(CUDA) only inside the CUDAToolkit_FOUND branch.

Copilot uses AI. Check for mistakes.

find_package(CUDAToolkit QUIET)
if (CUDAToolkit_FOUND)
include(../cuda_common.cmake)

# Try to find nvbench, but don't require it
find_package(nvbench CONFIG QUIET)

if (nvbench_FOUND)
Comment thread
WenqingLan1 marked this conversation as resolved.
message(STATUS "Found nvbench, building nvbench benchmarks")

# list all your CUDA benchmark source files here
set(NVBENCH_SOURCES
kernel_launch.cu
sleep_kernel.cu
# add more *.cu as needed
)

foreach(src ${NVBENCH_SOURCES})
# strip ".cu" → NAME_WE
get_filename_component(basename ${src} NAME_WE)
set(target nvbench_${basename})

add_executable(${target} ${src})
target_compile_features(${target} PUBLIC cuda_std_17)
target_link_libraries(${target}
PRIVATE nvbench::nvbench nvbench::main
)
install(TARGETS ${target} RUNTIME DESTINATION bin)
Comment thread
WenqingLan1 marked this conversation as resolved.
endforeach()
else()
message(STATUS "nvbench not found, skipping nvbench benchmarks.")
message(STATUS "To build nvbench benchmarks, first build the submodule in third_party/nvbench")
endif()
else()
message(STATUS "CUDA not found, skipping nvbench benchmarks.")
endif()
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#include <nvbench/nvbench.cuh>

__global__ void empty_kernel() {}
Comment thread
WenqingLan1 marked this conversation as resolved.

// NVBench entry point measuring the cost of launching a do-nothing kernel:
// each timed iteration launches empty_kernel with a single thread on the
// nvbench-managed stream, so the measurement is dominated by launch overhead
// rather than kernel work.
void kernel_launch(nvbench::state &state) {
state.exec([](nvbench::launch &launch) { empty_kernel<<<1, 1, 0, launch.get_stream()>>>(); });
}

// Register the benchmark with nvbench under the name "kernel_launch".
NVBENCH_BENCH(kernel_launch);
22 changes: 22 additions & 0 deletions superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#include <cuda/std/chrono>
#include <cuda_runtime.h>
#include <nvbench/nvbench.cuh>
Comment thread
WenqingLan1 marked this conversation as resolved.

// Device-side busy-wait: a single thread spins on the device
// high-resolution clock until the requested number of microseconds has
// elapsed. No memory is touched; the kernel exists purely to occupy the GPU
// for a controllable duration.
__global__ void sleep_kernel(nvbench::int64_t microseconds) {
    const auto deadline =
        cuda::std::chrono::high_resolution_clock::now() + cuda::std::chrono::microseconds(microseconds);

    // Poll the clock until the deadline passes.
    for (;;) {
        if (cuda::std::chrono::high_resolution_clock::now() >= deadline) {
            break;
        }
    }
}

// NVBench entry point: read the requested sleep duration from the
// "Duration (us)" axis and time a single-thread sleep_kernel launch on the
// nvbench-managed stream.
void sleep_benchmark(nvbench::state &state) {
    const auto requested_us = state.get_int64("Duration (us)");
    state.exec([requested_us](nvbench::launch &launch) {
        sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(requested_us);
    });
}
// Sweep durations from 0 us to 100 us in 5 us steps.
NVBENCH_BENCH(sleep_benchmark)
    .add_int64_axis("Duration (us)", nvbench::range(0, 100, 5))
    .set_timeout(1); // Limit to one second per measurement.
Comment thread
WenqingLan1 marked this conversation as resolved.
Loading
Loading