From 67ff75a0ada663db136f18733210a0a5fcb0f082 Mon Sep 17 00:00:00 2001 From: Marco Franzreb Salgado Date: Thu, 28 May 2026 09:51:57 +0200 Subject: [PATCH 1/2] Add frequency measurement to the stopping criterion --- nvbench/detail/measure_cold.cu | 2 ++ nvbench/stopping_criterion.cuh | 13 +++++++++++++ 2 files changed, 15 insertions(+) diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu index c0851b81..60d92488 100644 --- a/nvbench/detail/measure_cold.cu +++ b/nvbench/detail/measure_cold.cu @@ -147,6 +147,8 @@ void measure_cold_base::record_measurements() m_sm_clock_rates.push_back(current_clock_rate); m_sm_clock_rate_accumulator += current_clock_rate; + + m_stopping_criterion.add_frequency(current_clock_rate); } // Update and record timers and counters: diff --git a/nvbench/stopping_criterion.cuh b/nvbench/stopping_criterion.cuh index 43bec172..73b1402b 100644 --- a/nvbench/stopping_criterion.cuh +++ b/nvbench/stopping_criterion.cuh @@ -109,6 +109,14 @@ public: this->do_initialize(); } + /** + * Provide the GPU clock frequency (Hz) observed for the current sample. It is not called when + * doing CPU-only benchmarking (i.e.: using `nvbench::exec_tag::cpu_only` or + * `nvbench::exec_tag::no_gpu`) or profiling (`--profile` option). When called, it is done before + * calling `add_measurement` for the same sample. + */ + void add_frequency(nvbench::float32_t frequency_hz) { this->do_add_frequency(frequency_hz); } + /** * Add the latest measurement to the criterion */ @@ -134,6 +142,11 @@ protected: * Check if the criterion has been met for all measurements processed by `add_measurement` */ virtual bool do_is_finished() = 0; + + /** + * Receive the GPU clock frequency (Hz) for the current sample. Default no-op. + */ + virtual void do_add_frequency(nvbench::float32_t /*frequency_hz*/) {} }; } // namespace nvbench From 90ac65a8dd7801582dce74d330ab7b022e26a3ac Mon Sep 17 00:00:00 2001 From: Marco Franzreb Salgado Date: Thu, 4 Jun 2026 18:00:31 +0200 Subject: [PATCH 2/2] Frequency measurement improvements Add an example that shows usage of a custom criterion with frequency measurement Add a test that ensures throwing from "add_measurement" and "add_frequency" works as expected. --- examples/CMakeLists.txt | 11 +- examples/frequency_criterion.cu | 120 +++++++++++++++++++++ testing/CMakeLists.txt | 1 + testing/criterion_exception.cu | 181 ++++++++++++++++++++++++++++++++ 4 files changed, 311 insertions(+), 2 deletions(-) create mode 100644 examples/frequency_criterion.cu create mode 100644 testing/criterion_exception.cu diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 2abe3c7d..11b64091 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -6,12 +6,19 @@ set(example_srcs enums.cu exec_tag_sync.cu exec_tag_timer.cu + frequency_criterion.cu skip.cu stream.cu summaries.cu throughput.cu ) +# These examples use custom stopping criteria that don't support the --min-time argument: +set(no_min_time_examples + custom_criterion.cu + frequency_criterion.cu +) + # Metatarget for all examples: add_custom_target(nvbench.example.all) add_dependencies(nvbench.all nvbench.example.all) @@ -30,8 +37,8 @@ function (nvbench_add_examples_target target_prefix cuda_std) set_target_properties(${example_name} PROPERTIES COMPILE_FEATURES cuda_std_${cuda_std}) set(example_args --timeout 0.1) - # The custom_criterion example doesn't support the --min-time argument: - if (NOT "${example_src}" STREQUAL "custom_criterion.cu") + # Some examples use custom criteria that don't support the --min-time argument: + if (NOT "${example_src}" IN_LIST no_min_time_examples) list(APPEND example_args --min-time 1e-5) endif() diff --git a/examples/frequency_criterion.cu b/examples/frequency_criterion.cu new file mode 100644 index 00000000..df878e63 --- /dev/null +++ b/examples/frequency_criterion.cu @@ -0,0 +1,120 @@ +/* + * Copyright 2026 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. + * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +// Grab some testing kernels from NVBench: +#include + +// Thrust vectors simplify memory management: +#include + +#include + +// This example shows how to write a stopping criterion that *requires* the GPU +// clock frequency that NVBench observes for every cold-measurement sample. +// +// In addition to `do_add_measurement()`, a criterion may override +// `do_add_frequency()` to receive the SM clock rate (in Hz) measured during the +// sample. NVBench calls `add_frequency()` immediately before `add_measurement()` +// for the same sample -- but only when it can measure the clock. It is NOT +// called while profiling (the `--profile` option) or for CPU-only benchmarks +// (`nvbench::exec_tag::cpu_only` / `no_gpu`). +// +// Like the `fixed` criterion in `custom_criterion.cu`, this one simply runs for +// a fixed number of samples. The difference is that it also collects the +// per-sample frequency and throws if a sample arrives without one. The thrown +// exception is caught per-benchmark by NVBench and reported as a failure, so +// running this benchmark with `--profile` produces a clear error instead of +// silently ignoring the missing frequency. + +// Inherit from the stopping_criterion_base class: +class frequency_criterion final : public nvbench::stopping_criterion_base +{ + nvbench::int64_t m_num_samples{}; + bool m_has_frequency{false}; + +public: + frequency_criterion() + : nvbench::stopping_criterion_base{"frequency", {{"max-samples", nvbench::int64_t{42}}}} + {} + +protected: + // Setup the criterion in the `do_initialize()` method: + virtual void do_initialize() override + { + m_num_samples = 0; + m_has_frequency = false; + } + + // Collect the GPU clock frequency for the current sample. NVBench calls this + // before `do_add_measurement()` whenever a frequency is available: + virtual void do_add_frequency(nvbench::float32_t /* frequency_hz */) override + { + m_has_frequency = true; + } + + // Process new measurements in the `do_add_measurement()` method: + virtual void do_add_measurement(nvbench::float64_t /* measurement */) override + { + // This criterion requires a frequency for every sample. NVBench calls + // `do_add_frequency()` before `do_add_measurement()` when one is available, + // so a missing frequency here means none was provided for this sample: + if (!m_has_frequency) + { + throw std::runtime_error( + "frequency_criterion requires a GPU clock frequency for every sample, but none was " + "provided. NVBench does not measure the clock frequency when profiling (--profile) or for " + "CPU-only benchmarks (nvbench::exec_tag::cpu_only / no_gpu)."); + } + + m_has_frequency = false; // consume it; the next sample must provide its own + m_num_samples++; + } + + // Check if the stopping criterion is met in the `do_is_finished()` method: + virtual bool do_is_finished() override + { + return m_num_samples >= m_params.get_int64("max-samples"); + } +}; + +// Register the criterion with NVBench: +NVBENCH_REGISTER_CRITERION(frequency_criterion); + +void throughput_bench(nvbench::state &state) +{ + // Allocate input data: + const std::size_t num_values = 64 * 1024 * 1024 / sizeof(nvbench::int32_t); + thrust::device_vector input(num_values); + thrust::device_vector output(num_values); + + // Provide throughput information: + state.add_element_count(num_values, "NumElements"); + state.add_global_memory_reads(num_values, "DataSize"); + state.add_global_memory_writes(num_values); + + state.exec([&input, &output, num_values](nvbench::launch &launch) { + (void)num_values; // clang thinks this is unused... + nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>( + thrust::raw_pointer_cast(input.data()), + thrust::raw_pointer_cast(output.data()), + num_values); + }); +} +NVBENCH_BENCH(throughput_bench).set_stopping_criterion("frequency"); diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt index 7dc6cef9..df457dc0 100644 --- a/testing/CMakeLists.txt +++ b/testing/CMakeLists.txt @@ -6,6 +6,7 @@ set(test_srcs cuda_timer.cu cuda_stream.cu cpu_timer.cu + criterion_exception.cu criterion_manager.cu criterion_params.cu custom_main_custom_args.cu diff --git a/testing/criterion_exception.cu b/testing/criterion_exception.cu new file mode 100644 index 00000000..aac365ab --- /dev/null +++ b/testing/criterion_exception.cu @@ -0,0 +1,181 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +#include "test_asserts.cuh" + +// Verifies that an exception thrown by a stopping criterion aborts the +// benchmark (the state is marked as failed) instead of being silently swallowed. + +namespace +{ + +__global__ void spin_kernel(nvbench::uint64_t target_cycles) +{ + const auto start = static_cast(clock64()); + while (static_cast(clock64()) - start < target_cycles) + { + } +} + +constexpr nvbench::uint64_t spin_cycles = 100000; + +// Where the criterion should throw from: +enum class throw_site +{ + frequency, + measurement, +}; + +// Shared probe so the test can observe how many times the criterion was +// consulted. If the exception were swallowed and sampling continued, these +// counts would climb well past the single call that throws. +struct criterion_probe +{ + throw_site site{throw_site::measurement}; + int frequency_calls{0}; + int measurement_calls{0}; +}; + +criterion_probe g_probe; + +// A stopping criterion that throws on demand. The throw happens on the first +// sample, which should abort the run before `do_is_finished()` is ever +// consulted. If `do_is_finished()` *is* reached, the exception must have been +// swallowed, so it returns true to end the run immediately -- this keeps the +// regression case from spinning until the benchmark timeout. +class throwing_criterion final : public nvbench::stopping_criterion_base +{ +public: + throwing_criterion() + : nvbench::stopping_criterion_base{"test_throwing", {}} + {} + +protected: + void do_initialize() override {} + + void do_add_frequency(nvbench::float32_t /* frequency_hz */) override + { + ++g_probe.frequency_calls; + if (g_probe.site == throw_site::frequency) + { + throw std::runtime_error{"criterion failure from add_frequency"}; + } + } + + void do_add_measurement(nvbench::float64_t /* measurement */) override + { + ++g_probe.measurement_calls; + if (g_probe.site == throw_site::measurement) + { + throw std::runtime_error{"criterion failure from add_measurement"}; + } + } + + bool do_is_finished() override + { + // Only reachable if a sample completed without the throw aborting the run, + // i.e. the exception was swallowed. Finish immediately so the test fails + // fast on the is_skipped() check instead of sampling until the timeout. + return true; + } +}; +NVBENCH_REGISTER_CRITERION(throwing_criterion); + +struct spin_generator +{ + void operator()(nvbench::state &state, nvbench::type_list<>) const + { + state.exec(nvbench::exec_tag::impl::cold, [](nvbench::launch &launch) { + spin_kernel<<<1, 1, 0, launch.get_stream()>>>(spin_cycles); + }); + } +}; + +using benchmark_type = nvbench::benchmark; + +// Runs a benchmark whose criterion throws from `site`, and asserts that the +// benchmark failed (state skipped with the criterion's error) rather than +// completing. +void run_and_expect_failure(throw_site site) +{ + g_probe = criterion_probe{}; + g_probe.site = site; + + benchmark_type bench{spin_generator{}}; + bench.add_device(0); + bench.set_stopping_criterion("test_throwing"); + + // Disable throttle detection. Otherwise the unreliable clock reading of this + // tiny kernel can look like throttling, causing record_measurements() to + // discard the sample before the criterion is ever consulted -- the throw + // would never fire and the run would simply time out. + bench.set_throttle_threshold(0.f); + + bench.run(); + + NVBENCH_CUDA_CALL(cudaDeviceSynchronize()); + + const auto &states = bench.get_states(); + ASSERT(!states.empty()); + for (const auto &state : states) + { + ASSERT(state.is_skipped()); + ASSERT(state.get_skip_reason().find("criterion failure") != std::string::npos); + } +} + +// A throw from `add_measurement` must stop the run after the first sample. +void test_add_measurement_exception_stops_benchmark() +{ + run_and_expect_failure(throw_site::measurement); + + // The skip check above is what proves the run aborted; this confirms the + // throw happened on the very first measurement, + // and that a frequency measurement was collected as well. + ASSERT(g_probe.frequency_calls == 1); + ASSERT(g_probe.measurement_calls == 1); +} + +// A throw from `add_frequency` must stop the run before the measurement for that +// sample is ever recorded. +void test_add_frequency_exception_stops_benchmark() +{ + run_and_expect_failure(throw_site::frequency); + + ASSERT(g_probe.frequency_calls == 1); + ASSERT(g_probe.measurement_calls == 0); +} + +} // namespace + +int main() +try +{ + test_add_measurement_exception_stops_benchmark(); + test_add_frequency_exception_stops_benchmark(); + + return 0; +} +catch (std::exception &e) +{ + fmt::print("{}\n", e.what()); + return 1; +}