45 commits
741ee98
add nvbench kernel launch
WenqingLan1 Jul 22, 2025
0ae7864
submodule update
WenqingLan1 Jul 22, 2025
35bfb61
init sleep kernel
WenqingLan1 Jul 30, 2025
66b4786
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Aug 25, 2025
82aed0c
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Sep 22, 2025
24ee0a5
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Oct 8, 2025
bd87f50
test sleep kernel
WenqingLan1 Oct 8, 2025
a663db6
add sm 103
WenqingLan1 Oct 8, 2025
32fe197
add arg parsing logic
WenqingLan1 Oct 8, 2025
76562dc
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Oct 8, 2025
3eb5525
add arg parsing tests
WenqingLan1 Oct 9, 2025
4785fe6
refactor
WenqingLan1 Oct 9, 2025
1fb7c05
refine logic - remove gpu_id
WenqingLan1 Oct 9, 2025
83c442c
add doc
WenqingLan1 Oct 9, 2025
4b274c4
refine regex & update nvbench submodule
WenqingLan1 Oct 9, 2025
0cf48bb
update cmake
WenqingLan1 Oct 10, 2025
5905647
fix lint
WenqingLan1 Oct 10, 2025
baa57c9
fix lint
WenqingLan1 Oct 10, 2025
ecce2d9
fix import
WenqingLan1 Oct 10, 2025
3a58ead
fix
WenqingLan1 Oct 10, 2025
d0d8773
fix
WenqingLan1 Oct 10, 2025
fbb5969
fix
WenqingLan1 Oct 10, 2025
f007745
fix
WenqingLan1 Oct 10, 2025
b6b6082
fix
WenqingLan1 Oct 10, 2025
0f2c838
fix
WenqingLan1 Oct 10, 2025
5bd20f6
fix
WenqingLan1 Oct 10, 2025
ab88d25
fix pipeline
WenqingLan1 Oct 10, 2025
3faaf60
fix cmake
WenqingLan1 Oct 13, 2025
896a46a
fix pipeline
WenqingLan1 Oct 14, 2025
5d4986b
fix pipeline
WenqingLan1 Oct 14, 2025
b246522
fix pipeline & mlc version
WenqingLan1 Oct 14, 2025
ffe182e
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Dec 17, 2025
2877feb
Merge branch 'main' into feat/third_party/nvbench
WenqingLan1 Dec 22, 2025
0902eef
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Feb 3, 2026
498d551
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Feb 6, 2026
0804c12
fix comments
WenqingLan1 Feb 6, 2026
c1d1e43
add auto throughput benchmark
WenqingLan1 Feb 18, 2026
c34591d
refined logic & fix bug
WenqingLan1 Feb 20, 2026
68f5c7d
add comment to clarify diff between nvbench-kernel-launch and kernel-…
WenqingLan1 Feb 26, 2026
0bde332
resolve comments
WenqingLan1 Mar 10, 2026
7c456cf
fix lint
WenqingLan1 Mar 10, 2026
9643150
fix pipeline & resolve comments
WenqingLan1 Mar 10, 2026
f1a3b6d
fix lint
WenqingLan1 Mar 10, 2026
fe48e35
fix test
WenqingLan1 Mar 10, 2026
e1e12d2
Merge branch 'microsoft:main' into feat/third_party/nvbench
WenqingLan1 Apr 8, 2026
6 changes: 5 additions & 1 deletion .github/workflows/codeql-analysis.yml
@@ -52,7 +52,11 @@ jobs:
- name: Install Dependency
run: |
DEBIAN_FRONTEND=noninteractive apt-get update
DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev sudo
DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev sudo build-essential
- name: Setup CMake
uses: lukka/get-cmake@latest
Copilot AI (Jan 23, 2026):

Using @latest for third-party GitHub Actions is a supply-chain risk and can lead to non-reproducible CI behavior. Pin this action to a specific tagged version or commit SHA.

Suggested change:
-      uses: lukka/get-cmake@latest
+      uses: lukka/get-cmake@v3.20.0
with:
cmakeVersion: '3.20.0'
- name: Initialize CodeQL
uses: github/codeql-action/init@v3
with:
3 changes: 3 additions & 0 deletions .gitignore
@@ -151,6 +151,9 @@ cython_debug/
*.userosscache
*.sln.docstates

# Build temporary files
compile_commands.json

# Build results
[Dd]ebug/
[Dd]ebugPublic/
3 changes: 3 additions & 0 deletions .gitmodules
@@ -33,3 +33,6 @@
[submodule "third_party/nvbandwidth"]
path = third_party/nvbandwidth
url = https://github.com/NVIDIA/nvbandwidth.git
[submodule "third_party/nvbench"]
path = third_party/nvbench
url = https://github.com/NVIDIA/nvbench.git
23 changes: 22 additions & 1 deletion dockerfile/cuda12.8.dockerfile
@@ -61,6 +61,27 @@ RUN apt-get update && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*

# Install CMake 3.30.4 for nvbench compatibility
RUN apt-get update && \
apt-get remove -y cmake cmake-data && \
apt-get autoremove -y && \
cd /tmp && \
ARCH=$(uname -m) && \
case ${ARCH} in \
"aarch64") CMAKE_ARCH="aarch64" ;; \
"x86_64") CMAKE_ARCH="x86_64" ;; \
"arm64") CMAKE_ARCH="aarch64" ;; \
*) CMAKE_ARCH="x86_64" ;; \
esac && \
echo "Detected architecture: ${ARCH}, using CMAKE_ARCH: ${CMAKE_ARCH}" && \
wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \
ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
rm -rf cmake-3.30.4-linux-${CMAKE_ARCH}* && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

ARG NUM_MAKE_JOBS=
ARG TARGETPLATFORM
ARG TARGETARCH
@@ -161,7 +182,7 @@ ADD dockerfile/etc /opt/microsoft/
WORKDIR ${SB_HOME}

ADD third_party third_party
RUN make -C third_party cuda_with_msccl
RUN make -C third_party cuda_with_msccl cuda_nvbench

ADD . .
RUN python3 -m pip install --upgrade setuptools==70.3.0 && \
24 changes: 23 additions & 1 deletion dockerfile/cuda12.9.dockerfile
@@ -62,6 +62,28 @@ RUN apt-get update && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*

# Install CMake 3.30.4 for nvbench compatibility
RUN apt-get update && \
apt-get remove -y cmake cmake-data && \
apt-get autoremove -y && \
cd /tmp && \
ARCH=$(uname -m) && \
case ${ARCH} in \
"aarch64") CMAKE_ARCH="aarch64" ;; \
"x86_64") CMAKE_ARCH="x86_64" ;; \
"arm64") CMAKE_ARCH="aarch64" ;; \
*) CMAKE_ARCH="x86_64" ;; \
esac && \
echo "Detected architecture: ${ARCH}, using CMAKE_ARCH: ${CMAKE_ARCH}" && \
wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \
ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
rm -rf cmake-3.30.4-linux-${CMAKE_ARCH}* && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
ARG NUM_MAKE_JOBS=
ARG TARGETPLATFORM
ARG TARGETARCH
@@ -162,7 +184,7 @@ ADD dockerfile/etc /opt/microsoft/
WORKDIR ${SB_HOME}

ADD third_party third_party
RUN make -C third_party cuda_with_msccl
RUN make -C third_party cuda_with_msccl cuda_nvbench

ADD . .
RUN python3 -m pip install --upgrade setuptools==78.1.0 && \
2 changes: 1 addition & 1 deletion dockerfile/cuda13.0.dockerfile
@@ -151,7 +151,7 @@ ADD dockerfile/etc /opt/microsoft/
WORKDIR ${SB_HOME}

Copilot AI (Apr 8, 2026):

The nvbench build is enabled here, but this Dockerfile doesn't install/upgrade CMake like cuda12.8/12.9 do. Since the nvbench benchmark CMakeLists gates on CMake >= 3.30.4, this image may silently skip building the nvbench benchmark binaries (or fail building nvbench itself) depending on the base image's CMake version. Please ensure the required CMake version is available in this image too, or adjust the nvbench CMake requirement/gating strategy.

Suggested change:
+# nvbench requires CMake >= 3.30.4; ensure an adequate version is available
+# before building the cuda_nvbench third-party target.
+RUN python3 -m pip install --no-cache-dir --upgrade cmake==3.30.4
ADD third_party third_party
RUN make -C third_party cuda
RUN make -C third_party cuda cuda_nvbench

ADD . .
RUN python3 -m pip install --upgrade setuptools==78.1.0 && \
99 changes: 99 additions & 0 deletions docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -172,6 +172,105 @@ Supports the use of double unit types and the use of tensor cores.
| gpu-burn/gpu_[0-9]_pass | yes/no | The result of the gpu-burn test for each GPU (1: yes, 0: no). |
| gpu-burn/abort | yes/no | Whether or not GPU-burn test aborted before returning GPU results (1: yes, 0: no). |

### `nvbench-sleep-kernel`

#### Introduction

Measure GPU kernel execution time using NVBench's sleep kernel benchmark. This benchmark creates CUDA kernels that sleep for specified durations (in microseconds) and measures the actual execution time, providing insights into GPU scheduling overhead and timing accuracy.

The benchmark supports multiple duration specification formats:
- Single value: `"50"` - Test single duration of 50μs
- List format: `"[25,50,75]"` - Test multiple specific durations
- Range format: `"[25:75]"` - Test all values from 25μs to 75μs
- Range with step: `"[0:50:10]"` - Test from 0μs to 50μs in steps of 10μs

Performed by [NVBench](https://github.com/NVIDIA/nvbench) sleep kernel benchmark.
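The four spec formats above can be expressed as a short grammar. The sketch below is a hypothetical helper (`parse_spec` is not part of SuperBench or NVBench) showing how a single value, a comma list, a range, and a stepped range expand to concrete durations:

```python
import re


def parse_spec(spec):
    """Expand a duration/parameter spec string into a list of ints.

    Accepted forms, mirroring the documentation above:
      "50"         -> [50]
      "[25,50,75]" -> [25, 50, 75]
      "[25:75]"    -> [25, 26, ..., 75]        (inclusive range)
      "[0:50:10]"  -> [0, 10, 20, 30, 40, 50]  (range with step)
    """
    spec = spec.strip().strip('"')
    # Range form, with an optional step: [start:stop] or [start:stop:step].
    m = re.fullmatch(r'\[(\d+):(\d+)(?::(\d+))?\]', spec)
    if m:
        start, stop = int(m.group(1)), int(m.group(2))
        step = int(m.group(3)) if m.group(3) else 1
        return list(range(start, stop + 1, step))
    # List form: [v1,v2,...].
    if spec.startswith('[') and spec.endswith(']'):
        return [int(x) for x in spec[1:-1].split(',')]
    # Single value.
    return [int(spec)]
```

For example, `parse_spec('[0:50:10]')` yields `[0, 10, 20, 30, 40, 50]`; note the range endpoints are inclusive.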

#### Metrics

| Name | Unit | Description |
|-----------------------------------------|-----------|-------------------------------------------------------|
| nvbench-sleep-kernel/duration_us_{X}_cpu_time | time (μs) | CPU-measured time for duration X microseconds. |
| nvbench-sleep-kernel/duration_us_{X}_gpu_time | time (μs) | GPU-measured time for duration X microseconds. |
| nvbench-sleep-kernel/duration_us_{X}_batch_gpu_time | time (μs) | GPU batch execution time for duration X microseconds. |

Where `{X}` is the sleep duration in microseconds (e.g., 25, 50, 75).

### `nvbench-kernel-launch`

#### Introduction

Measure GPU kernel launch overhead and execution time using NVBench's kernel launch benchmark. This benchmark evaluates the time required to launch kernels on the GPU and measures both CPU-side and GPU-side timing for kernel execution.

The benchmark provides insights into:
- Kernel launch latency
- CPU/GPU synchronization overhead
- Batch execution performance

Performed by [NVBench](https://github.com/NVIDIA/nvbench) kernel launch benchmark.

#### Comparison with `kernel-launch`

Both `nvbench-kernel-launch` and `kernel-launch` measure kernel launch latency, but they differ in methodology:

| Aspect | `kernel-launch` | `nvbench-kernel-launch` |
|--------|-----------------|-------------------------|
| L2 Cache | Warm (cached) | Cold (flushed before each sample) |
| Measurement | Warm-cache / steady-state | Cold-cache / first-access |
| Iterations | Fixed 2M iterations | Adaptive (statistical stopping) |

**Important: Do not cross-compare results between these two benchmarks.** For performance regression detection, either benchmark works well. Always compare against historical data from the same benchmark.

Choose based on what scenario matters for your workload:
- **`kernel-launch`**: Measures warm-cache performance, reflecting steady-state behavior in long-running applications where caches are typically hot
- **`nvbench-kernel-launch`**: Measures cold-cache performance, reflecting first-access scenarios or workloads with poor cache locality

#### Metrics

| Name | Unit | Description |
|-------------------------------------|-----------|------------------------------------------------|
| nvbench-kernel-launch/cpu_time | time (μs) | CPU-measured kernel execution time. |
| nvbench-kernel-launch/gpu_time | time (μs) | GPU-measured kernel execution time. |
| nvbench-kernel-launch/batch_gpu_time | time (μs) | GPU batch execution time. |

### `nvbench-auto-throughput`

#### Introduction

Measure GPU memory throughput and efficiency metrics using NVBench's auto throughput benchmark. This benchmark copies a 128 MiB buffer of int32 values with configurable stride and block size parameters, measuring memory bandwidth efficiency and CUPTI-based performance counters.

#### Parameters

- **Stride**: Controls the memory access pattern by specifying the gap between consecutive memory accesses. A stride of 1 means contiguous (coalesced) memory access. Larger stride values (2, 4, 8, etc.) create non-contiguous access patterns, useful for stress-testing memory subsystem behavior under different access patterns.
- **BlockSize**: The number of threads per CUDA block (e.g., 128, 256, 512, 1024). Different block sizes affect occupancy and scheduling efficiency.
- **ItemsPerThread**: The number of elements each thread processes (1 or 2). Higher values increase work per thread.
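To illustrate what Stride controls, here is a toy sketch of the element indices one pass of threads would touch (illustrative only, not nvbench's actual copy kernel):

```python
def access_pattern(num_threads, stride):
    """Element indices touched when thread i reads element i * stride.

    With stride=1 the accesses are contiguous (coalesced); larger
    strides leave gaps between consecutive threads' accesses.
    """
    return [i * stride for i in range(num_threads)]


# access_pattern(4, 1) -> [0, 1, 2, 3]   (coalesced)
# access_pattern(4, 4) -> [0, 4, 8, 12]  (non-contiguous)
```

Coalesced access lets the hardware service a warp's loads with fewer memory transactions, which is why load/store efficiency drops as stride grows.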

The benchmark supports multiple parameter specification formats:
- Single value: `"2"` - Test single value
- List format: `"[1,2,4,8]"` - Test multiple specific values
- Range format: `"[1:4]"` - Test all values from 1 to 4
- Range with step: `"[1:8:2]"` - Test from 1 to 8 in steps of 2

Performed by [NVBench](https://github.com/NVIDIA/nvbench) auto throughput benchmark with CUPTI metrics collection.

#### Metrics

| Name | Unit | Description |
|-------------------------------------------------------------------------|--------------|--------------------------------------------------------------------------------------|
| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_cpu\_time | time (μs) | CPU-measured execution time. |
| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_gpu\_time | time (μs) | GPU-measured execution time. |
| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_batch\_gpu\_time | time (μs) | GPU batch execution time. |
| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_hbw\_peak | percent (%) | HBM peak bandwidth utilization percentage. |
| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_load\_eff | percent (%) | Global memory load efficiency percentage. |
| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_store\_eff | percent (%) | Global memory store efficiency percentage. |
| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_l1\_hit\_rate | percent (%) | L1 cache hit rate percentage (informational only, excluded from pass/fail criteria). |
| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_l2\_hit\_rate | percent (%) | L2 cache hit rate percentage (informational only, excluded from pass/fail criteria). |
| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_throughput | GB/s | Memory throughput calculated from element rate (elements/s × 4 bytes for int32). |

Where `{T}` is ItemsPerThread (1 or 2), `{S}` is Stride value, and `{B}` is BlockSize (e.g., 128, 256, 512, 1024).

> **Note:** L1 and L2 cache hit rates are collected for informational purposes only and should not be used for performance validation pass/fail criteria, as cache behavior can vary significantly based on system state and workload characteristics.
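A sketch of how the metric names and the throughput figure above are derived. These are hypothetical helpers that mirror the documented naming pattern and the elements/s × 4 bytes conversion, not SuperBench's internal code:

```python
import itertools

PREFIX = 'nvbench-auto-throughput'


def metric_names(items_per_thread, strides, block_sizes):
    """Enumerate metric-name prefixes for every parameter combination."""
    return [
        f'{PREFIX}/ipt_{t}_stride_{s}_blk_{b}'
        for t, s, b in itertools.product(items_per_thread, strides, block_sizes)
    ]


def throughput_gbps(elements_per_second):
    """Throughput metric: element rate times 4 bytes (int32), in GB/s."""
    return elements_per_second * 4 / 1e9


# metric_names([1, 2], [1], [256]) ->
#   ['nvbench-auto-throughput/ipt_1_stride_1_blk_256',
#    'nvbench-auto-throughput/ipt_2_stride_1_blk_256']
```

So a run reporting 500e9 elements/s corresponds to a `throughput` metric of 2000 GB/s.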

### `cpu-hpl`

#### Introduction
26 changes: 26 additions & 0 deletions examples/benchmarks/nvbench_auto_throughput.py
@@ -0,0 +1,26 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Micro benchmark example for NVBench Auto Throughput.

Commands to run:
python3 examples/benchmarks/nvbench_auto_throughput.py
"""

from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.common.utils import logger

if __name__ == '__main__':
context = BenchmarkRegistry.create_benchmark_context(
'nvbench-auto-throughput',
platform=Platform.CUDA,
parameters='--devices 0 --stride "[1,2,4,8]" --block_size "[256,512]" --timeout 30'
)

benchmark = BenchmarkRegistry.launch_benchmark(context)
if benchmark:
logger.info(
'benchmark: {}, return code: {}, result: {}'.format(
benchmark.name, benchmark.return_code, benchmark.result
)
)
34 changes: 34 additions & 0 deletions examples/benchmarks/nvbench_kernel_launch.py
@@ -0,0 +1,34 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Micro benchmark example for NVBench Kernel Launch.

Commands to run:
python3 examples/benchmarks/nvbench_kernel_launch.py
"""

from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.common.utils import logger

if __name__ == '__main__':
context = BenchmarkRegistry.create_benchmark_context(
'nvbench-kernel-launch',
platform=Platform.CUDA,
parameters=(
'--timeout 30 '
'--min-samples 10 '
'--min-time 1.0 '
'--max-noise 0.1 '
'--stopping-criterion stdrel '
'--throttle-threshold 80 '
'--throttle-recovery-delay 1.0'
)
)

benchmark = BenchmarkRegistry.launch_benchmark(context)
if benchmark:
logger.info(
'benchmark: {}, return code: {}, result: {}'.format(
benchmark.name, benchmark.return_code, benchmark.result
)
)
24 changes: 24 additions & 0 deletions examples/benchmarks/nvbench_sleep_kernel.py
@@ -0,0 +1,24 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Micro benchmark example for NVBench Sleep Kernel.

Commands to run:
python3 examples/benchmarks/nvbench_sleep_kernel.py
"""

from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.common.utils import logger

if __name__ == '__main__':
context = BenchmarkRegistry.create_benchmark_context(
'nvbench-sleep-kernel', platform=Platform.CUDA, parameters='--duration_us "[25,50,75]" --timeout 10'
)

benchmark = BenchmarkRegistry.launch_benchmark(context)
if benchmark:
logger.info(
'benchmark: {}, return code: {}, result: {}'.format(
benchmark.name, benchmark.return_code, benchmark.result
)
)
48 changes: 12 additions & 36 deletions superbench/benchmarks/micro_benchmarks/__init__.py
@@ -39,42 +39,18 @@
from superbench.benchmarks.micro_benchmarks.directx_mem_bw_performance import DirectXGPUMemBw
from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops
from superbench.benchmarks.micro_benchmarks.nvbandwidth import NvBandwidthBenchmark
from superbench.benchmarks.micro_benchmarks.nvbench_kernel_launch import NvbenchKernelLaunch
from superbench.benchmarks.micro_benchmarks.nvbench_sleep_kernel import NvbenchSleepKernel
from superbench.benchmarks.micro_benchmarks.nvbench_auto_throughput import NvbenchAutoThroughput

__all__ = [
'BlasLtBaseBenchmark',
'ComputationCommunicationOverlap',
'CpuMemBwLatencyBenchmark',
'CpuHplBenchmark',
'CpuStreamBenchmark',
'CublasBenchmark',
'CublasLtBenchmark',
'CudaGemmFlopsBenchmark',
'CudaMemBwBenchmark',
'CudaNcclBwBenchmark',
'CudnnBenchmark',
'DiskBenchmark',
'DistInference',
'HipBlasLtBenchmark',
'GPCNetBenchmark',
'GemmFlopsBenchmark',
'GpuBurnBenchmark',
'GpuCopyBwBenchmark',
'GpuStreamBenchmark',
'IBBenchmark',
'IBLoopbackBenchmark',
'KernelLaunch',
'MemBwBenchmark',
'MicroBenchmark',
'MicroBenchmarkWithInvoke',
'ORTInferenceBenchmark',
'RocmGemmFlopsBenchmark',
'RocmMemBwBenchmark',
'ShardingMatmul',
'TCPConnectivityBenchmark',
'TensorRTInferenceBenchmark',
'DirectXGPUEncodingLatency',
'DirectXGPUCopyBw',
'DirectXGPUMemBw',
'DirectXGPUCoreFlops',
'NvBandwidthBenchmark',
'BlasLtBaseBenchmark', 'ComputationCommunicationOverlap', 'CpuMemBwLatencyBenchmark', 'CpuHplBenchmark',
'CpuStreamBenchmark', 'CublasBenchmark', 'CublasLtBenchmark', 'CudaGemmFlopsBenchmark', 'CudaMemBwBenchmark',
'CudaNcclBwBenchmark', 'CudnnBenchmark', 'DiskBenchmark', 'DistInference', 'HipBlasLtBenchmark', 'GPCNetBenchmark',
'GemmFlopsBenchmark', 'GpuBurnBenchmark', 'GpuCopyBwBenchmark', 'GpuStreamBenchmark', 'IBBenchmark',
'IBLoopbackBenchmark', 'KernelLaunch', 'MemBwBenchmark', 'MicroBenchmark', 'MicroBenchmarkWithInvoke',
'ORTInferenceBenchmark', 'RocmGemmFlopsBenchmark', 'RocmMemBwBenchmark', 'ShardingMatmul',
'TCPConnectivityBenchmark', 'TensorRTInferenceBenchmark', 'DirectXGPUEncodingLatency', 'DirectXGPUCopyBw',
'DirectXGPUMemBw', 'DirectXGPUCoreFlops', 'NvBandwidthBenchmark', 'NvbenchKernelLaunch', 'NvbenchSleepKernel',
'NvbenchAutoThroughput'
]