Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,19 @@ set(example_srcs
enums.cu
exec_tag_sync.cu
exec_tag_timer.cu
frequency_criterion.cu
skip.cu
stream.cu
summaries.cu
throughput.cu
)

# These examples use custom stopping criteria that don't support the --min-time argument:
set(no_min_time_examples
custom_criterion.cu
frequency_criterion.cu
)

# Metatarget for all examples:
add_custom_target(nvbench.example.all)
add_dependencies(nvbench.all nvbench.example.all)
Expand All @@ -30,8 +37,8 @@ function (nvbench_add_examples_target target_prefix cuda_std)
set_target_properties(${example_name} PROPERTIES COMPILE_FEATURES cuda_std_${cuda_std})

set(example_args --timeout 0.1)
# The custom_criterion example doesn't support the --min-time argument:
if (NOT "${example_src}" STREQUAL "custom_criterion.cu")
# Some examples use custom criteria that don't support the --min-time argument:
if (NOT "${example_src}" IN_LIST no_min_time_examples)
list(APPEND example_args --min-time 1e-5)
endif()

Expand Down
120 changes: 120 additions & 0 deletions examples/frequency_criterion.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
/*
* Copyright 2026 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <nvbench/nvbench.cuh>

// Grab some testing kernels from NVBench:
#include <nvbench/test_kernels.cuh>

// Thrust vectors simplify memory management:
#include <thrust/device_vector.h>

#include <stdexcept>

// This example shows how to write a stopping criterion that *requires* the GPU
// clock frequency that NVBench observes for every cold-measurement sample.
//
// In addition to `do_add_measurement()`, a criterion may override
// `do_add_frequency()` to receive the SM clock rate (in Hz) measured during the
// sample. NVBench calls `add_frequency()` immediately before `add_measurement()`
// for the same sample -- but only when it can measure the clock. It is NOT
// called while profiling (the `--profile` option) or for CPU-only benchmarks
// (`nvbench::exec_tag::cpu_only` / `no_gpu`).
//
// Like the `fixed` criterion in `custom_criterion.cu`, this one simply runs for
// a fixed number of samples. The difference is that it also collects the
// per-sample frequency and throws if a sample arrives without one. The thrown
// exception is caught per-benchmark by NVBench and reported as a failure, so
// running this benchmark with `--profile` produces a clear error instead of
// silently ignoring the missing frequency.

// Inherit from the stopping_criterion_base class:
class frequency_criterion final : public nvbench::stopping_criterion_base
{
nvbench::int64_t m_num_samples{};
bool m_has_frequency{false};

public:
frequency_criterion()
: nvbench::stopping_criterion_base{"frequency", {{"max-samples", nvbench::int64_t{42}}}}
{}

protected:
// Setup the criterion in the `do_initialize()` method:
virtual void do_initialize() override
{
m_num_samples = 0;
m_has_frequency = false;
}

// Collect the GPU clock frequency for the current sample. NVBench calls this
// before `do_add_measurement()` whenever a frequency is available:
virtual void do_add_frequency(nvbench::float32_t /* frequency_hz */) override
{
m_has_frequency = true;
}

// Process new measurements in the `do_add_measurement()` method:
virtual void do_add_measurement(nvbench::float64_t /* measurement */) override
{
// This criterion requires a frequency for every sample. NVBench calls
// `do_add_frequency()` before `do_add_measurement()` when one is available,
// so a missing frequency here means none was provided for this sample:
if (!m_has_frequency)
{
throw std::runtime_error(
"frequency_criterion requires a GPU clock frequency for every sample, but none was "
"provided. NVBench does not measure the clock frequency when profiling (--profile) or for "
"CPU-only benchmarks (nvbench::exec_tag::cpu_only / no_gpu).");
}

m_has_frequency = false; // consume it; the next sample must provide its own
m_num_samples++;
}

// Check if the stopping criterion is met in the `do_is_finished()` method:
virtual bool do_is_finished() override
{
return m_num_samples >= m_params.get_int64("max-samples");
}
};

// Register the criterion with NVBench:
NVBENCH_REGISTER_CRITERION(frequency_criterion);

void throughput_bench(nvbench::state &state)
{
// Allocate input data:
const std::size_t num_values = 64 * 1024 * 1024 / sizeof(nvbench::int32_t);
thrust::device_vector<nvbench::int32_t> input(num_values);
thrust::device_vector<nvbench::int32_t> output(num_values);

// Provide throughput information:
state.add_element_count(num_values, "NumElements");
state.add_global_memory_reads<nvbench::int32_t>(num_values, "DataSize");
state.add_global_memory_writes<nvbench::int32_t>(num_values);

state.exec([&input, &output, num_values](nvbench::launch &launch) {
(void)num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
thrust::raw_pointer_cast(input.data()),
thrust::raw_pointer_cast(output.data()),
num_values);
});
}
NVBENCH_BENCH(throughput_bench).set_stopping_criterion("frequency");
2 changes: 2 additions & 0 deletions nvbench/detail/measure_cold.cu
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,8 @@ void measure_cold_base::record_measurements()

m_sm_clock_rates.push_back(current_clock_rate);
m_sm_clock_rate_accumulator += current_clock_rate;

m_stopping_criterion.add_frequency(current_clock_rate);
}

// Update and record timers and counters:
Expand Down
13 changes: 13 additions & 0 deletions nvbench/stopping_criterion.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,14 @@ public:
this->do_initialize();
}

/**
* Provide the GPU clock frequency (Hz) observed for the current sample. It is not called when
* doing CPU-only benchmarking (i.e.: using `nvbench::exec_tag::cpu_only` or
* `nvbench::exec_tag::no_gpu`) or profiling (`--profile` option). When called, it is done before
* calling `add_measurement` for the same sample.
*/
void add_frequency(nvbench::float32_t frequency_hz) { this->do_add_frequency(frequency_hz); }

/**
* Add the latest measurement to the criterion
*/
Expand All @@ -134,6 +142,11 @@ protected:
* Check if the criterion has been met for all measurements processed by `add_measurement`
*/
virtual bool do_is_finished() = 0;

/**
* Receive the GPU clock frequency (Hz) for the current sample. Default no-op.
*/
virtual void do_add_frequency(nvbench::float32_t /*frequency_hz*/) {}
};

} // namespace nvbench
1 change: 1 addition & 0 deletions testing/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ set(test_srcs
cuda_timer.cu
cuda_stream.cu
cpu_timer.cu
criterion_exception.cu
criterion_manager.cu
criterion_params.cu
custom_main_custom_args.cu
Expand Down
181 changes: 181 additions & 0 deletions testing/criterion_exception.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <nvbench/benchmark.cuh>
#include <nvbench/callable.cuh>
#include <nvbench/criterion_manager.cuh>
#include <nvbench/cuda_call.cuh>
#include <nvbench/exec_tag.cuh>
#include <nvbench/launch.cuh>
#include <nvbench/state.cuh>
#include <nvbench/stopping_criterion.cuh>
#include <nvbench/type_list.cuh>
#include <nvbench/types.cuh>

#include <cuda_runtime.h>

#include <fmt/format.h>

#include <stdexcept>
#include <string>

#include "test_asserts.cuh"

// Verifies that an exception thrown by a stopping criterion aborts the
// benchmark (the state is marked as failed) instead of being silently swallowed.

namespace
{

__global__ void spin_kernel(nvbench::uint64_t target_cycles)
{
const auto start = static_cast<nvbench::uint64_t>(clock64());
while (static_cast<nvbench::uint64_t>(clock64()) - start < target_cycles)
{
}
}

constexpr nvbench::uint64_t spin_cycles = 100000;

// Where the criterion should throw from:
enum class throw_site
{
frequency,
measurement,
};

// Shared probe so the test can observe how many times the criterion was
// consulted. If the exception were swallowed and sampling continued, these
// counts would climb well past the single call that throws.
struct criterion_probe
{
throw_site site{throw_site::measurement};
int frequency_calls{0};
int measurement_calls{0};
};

criterion_probe g_probe;

// A stopping criterion that throws on demand. The throw happens on the first
// sample, which should abort the run before `do_is_finished()` is ever
// consulted. If `do_is_finished()` *is* reached, the exception must have been
// swallowed, so it returns true to end the run immediately -- this keeps the
// regression case from spinning until the benchmark timeout.
class throwing_criterion final : public nvbench::stopping_criterion_base
{
public:
throwing_criterion()
: nvbench::stopping_criterion_base{"test_throwing", {}}
{}

protected:
void do_initialize() override {}

void do_add_frequency(nvbench::float32_t /* frequency_hz */) override
{
++g_probe.frequency_calls;
if (g_probe.site == throw_site::frequency)
{
throw std::runtime_error{"criterion failure from add_frequency"};
}
}

void do_add_measurement(nvbench::float64_t /* measurement */) override
{
++g_probe.measurement_calls;
if (g_probe.site == throw_site::measurement)
{
throw std::runtime_error{"criterion failure from add_measurement"};
}
}

bool do_is_finished() override
{
// Only reachable if a sample completed without the throw aborting the run,
// i.e. the exception was swallowed. Finish immediately so the test fails
// fast on the is_skipped() check instead of sampling until the timeout.
return true;
}
};
NVBENCH_REGISTER_CRITERION(throwing_criterion);

struct spin_generator
{
void operator()(nvbench::state &state, nvbench::type_list<>) const
{
state.exec(nvbench::exec_tag::impl::cold, [](nvbench::launch &launch) {
spin_kernel<<<1, 1, 0, launch.get_stream()>>>(spin_cycles);
});
}
};

using benchmark_type = nvbench::benchmark<spin_generator>;

// Runs a benchmark whose criterion throws from `site`, and asserts that the
// benchmark failed (state skipped with the criterion's error) rather than
// completing.
void run_and_expect_failure(throw_site site)
{
g_probe = criterion_probe{};
g_probe.site = site;

benchmark_type bench{spin_generator{}};
bench.add_device(0);
bench.set_stopping_criterion("test_throwing");

// Disable throttle detection. Otherwise the unreliable clock reading of this
// tiny kernel can look like throttling, causing record_measurements() to
// discard the sample before the criterion is ever consulted -- the throw
// would never fire and the run would simply time out.
bench.set_throttle_threshold(0.f);

bench.run();

NVBENCH_CUDA_CALL(cudaDeviceSynchronize());

const auto &states = bench.get_states();
ASSERT(!states.empty());
for (const auto &state : states)
{
ASSERT(state.is_skipped());
ASSERT(state.get_skip_reason().find("criterion failure") != std::string::npos);
}
}

// A throw from `add_measurement` must stop the run after the first sample.
void test_add_measurement_exception_stops_benchmark()
{
run_and_expect_failure(throw_site::measurement);

// The skip check above is what proves the run aborted; this confirms the
// throw happened on the very first measurement,
// and that a frequency measurement was collected as well.
ASSERT(g_probe.frequency_calls == 1);
ASSERT(g_probe.measurement_calls == 1);
}

// A throw from `add_frequency` must stop the run before the measurement for that
// sample is ever recorded.
void test_add_frequency_exception_stops_benchmark()
{
run_and_expect_failure(throw_site::frequency);

ASSERT(g_probe.frequency_calls == 1);
ASSERT(g_probe.measurement_calls == 0);
}

} // namespace

int main()
try
{
test_add_measurement_exception_stops_benchmark();
test_add_frequency_exception_stops_benchmark();

return 0;
}
catch (std::exception &e)
{
fmt::print("{}\n", e.what());
return 1;
}