From 67ff75a0ada663db136f18733210a0a5fcb0f082 Mon Sep 17 00:00:00 2001
From: Marco Franzreb Salgado <mfranzrebsal@nvidia.com>
Date: Thu, 28 May 2026 09:51:57 +0200
Subject: [PATCH 1/2] Add frequency measurement to the stopping criterion

---
 nvbench/detail/measure_cold.cu |  2 ++
 nvbench/stopping_criterion.cuh | 13 +++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu
index c0851b81..60d92488 100644
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -147,6 +147,8 @@ void measure_cold_base::record_measurements()
 
     m_sm_clock_rates.push_back(current_clock_rate);
     m_sm_clock_rate_accumulator += current_clock_rate;
+
+    m_stopping_criterion.add_frequency(current_clock_rate);
   }
 
   // Update and record timers and counters:
diff --git a/nvbench/stopping_criterion.cuh b/nvbench/stopping_criterion.cuh
index 43bec172..73b1402b 100644
--- a/nvbench/stopping_criterion.cuh
+++ b/nvbench/stopping_criterion.cuh
@@ -109,6 +109,14 @@ public:
     this->do_initialize();
   }
 
+  /**
+   * Provide the GPU clock frequency (Hz) observed for the current sample. It is not called when
+   * doing CPU-only benchmarking (i.e.: using `nvbench::exec_tag::cpu_only` or
+   * `nvbench::exec_tag::no_gpu`) or profiling (`--profile` option). When called, it is done before
+   * calling `add_measurement` for the same sample.
+   */
+  void add_frequency(nvbench::float32_t frequency_hz) { this->do_add_frequency(frequency_hz); }
+
   /**
    * Add the latest measurement to the criterion
    */
@@ -134,6 +142,11 @@ protected:
    * Check if the criterion has been met for all measurements processed by `add_measurement`
    */
   virtual bool do_is_finished() = 0;
+
+  /**
+   * Receive the GPU clock frequency (Hz) for the current sample. Default no-op.
+   */
+  virtual void do_add_frequency(nvbench::float32_t /*frequency_hz*/) {}
 };
 
 } // namespace nvbench

From 90ac65a8dd7801582dce74d330ab7b022e26a3ac Mon Sep 17 00:00:00 2001
From: Marco Franzreb Salgado <mfranzrebsal@nvidia.com>
Date: Thu, 4 Jun 2026 18:00:31 +0200
Subject: [PATCH 2/2] Frequency measurement improvements

Add an example that shows usage of a custom criterion with frequency measurement

Add a test that ensures throwing from "add_measurement" and "add_frequency" works as expected.
---
 examples/CMakeLists.txt         |  11 +-
 examples/frequency_criterion.cu | 120 +++++++++++++++++++++
 testing/CMakeLists.txt          |   1 +
 testing/criterion_exception.cu  | 181 ++++++++++++++++++++++++++++++++
 4 files changed, 311 insertions(+), 2 deletions(-)
 create mode 100644 examples/frequency_criterion.cu
 create mode 100644 testing/criterion_exception.cu

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 2abe3c7d..11b64091 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -6,12 +6,19 @@ set(example_srcs
   enums.cu
   exec_tag_sync.cu
   exec_tag_timer.cu
+  frequency_criterion.cu
   skip.cu
   stream.cu
   summaries.cu
   throughput.cu
 )
 
+# These examples use custom stopping criteria that don't support the --min-time argument:
+set(no_min_time_examples
+  custom_criterion.cu
+  frequency_criterion.cu
+)
+
 # Metatarget for all examples:
 add_custom_target(nvbench.example.all)
 add_dependencies(nvbench.all nvbench.example.all)
@@ -30,8 +37,8 @@ function (nvbench_add_examples_target target_prefix cuda_std)
     set_target_properties(${example_name} PROPERTIES COMPILE_FEATURES cuda_std_${cuda_std})
 
     set(example_args --timeout 0.1)
-    # The custom_criterion example doesn't support the --min-time argument:
-    if (NOT "${example_src}" STREQUAL "custom_criterion.cu")
+    # Some examples use custom criteria that don't support the --min-time argument:
+    if (NOT "${example_src}" IN_LIST no_min_time_examples)
       list(APPEND example_args --min-time 1e-5)
     endif()
 
diff --git a/examples/frequency_criterion.cu b/examples/frequency_criterion.cu
new file mode 100644
index 00000000..df878e63
--- /dev/null
+++ b/examples/frequency_criterion.cu
@@ -0,0 +1,120 @@
+/*
+ *  Copyright 2026 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/nvbench.cuh>
+
+// Grab some testing kernels from NVBench:
+#include <nvbench/test_kernels.cuh>
+
+// Thrust vectors simplify memory management:
+#include <thrust/device_vector.h>
+
+#include <stdexcept>
+
+// This example shows how to write a stopping criterion that *requires* the GPU
+// clock frequency that NVBench observes for every cold-measurement sample.
+//
+// In addition to `do_add_measurement()`, a criterion may override
+// `do_add_frequency()` to receive the SM clock rate (in Hz) measured during the
+// sample. NVBench calls `add_frequency()` immediately before `add_measurement()`
+// for the same sample -- but only when it can measure the clock. It is NOT
+// called while profiling (the `--profile` option) or for CPU-only benchmarks
+// (`nvbench::exec_tag::cpu_only` / `no_gpu`).
+//
+// Like the `fixed` criterion in `custom_criterion.cu`, this one simply runs for
+// a fixed number of samples. The difference is that it also collects the
+// per-sample frequency and throws if a sample arrives without one. The thrown
+// exception is caught per-benchmark by NVBench and reported as a failure, so
+// running this benchmark with `--profile` produces a clear error instead of
+// silently ignoring the missing frequency.
+
+// Inherit from the stopping_criterion_base class:
+class frequency_criterion final : public nvbench::stopping_criterion_base
+{
+  nvbench::int64_t m_num_samples{};
+  bool m_has_frequency{false};
+
+public:
+  frequency_criterion()
+      : nvbench::stopping_criterion_base{"frequency", {{"max-samples", nvbench::int64_t{42}}}}
+  {}
+
+protected:
+  // Setup the criterion in the `do_initialize()` method:
+  virtual void do_initialize() override
+  {
+    m_num_samples   = 0;
+    m_has_frequency = false;
+  }
+
+  // Collect the GPU clock frequency for the current sample. NVBench calls this
+  // before `do_add_measurement()` whenever a frequency is available:
+  virtual void do_add_frequency(nvbench::float32_t /* frequency_hz */) override
+  {
+    m_has_frequency = true;
+  }
+
+  // Process new measurements in the `do_add_measurement()` method:
+  virtual void do_add_measurement(nvbench::float64_t /* measurement */) override
+  {
+    // This criterion requires a frequency for every sample. NVBench calls
+    // `do_add_frequency()` before `do_add_measurement()` when one is available,
+    // so a missing frequency here means none was provided for this sample:
+    if (!m_has_frequency)
+    {
+      throw std::runtime_error(
+        "frequency_criterion requires a GPU clock frequency for every sample, but none was "
+        "provided. NVBench does not measure the clock frequency when profiling (--profile) or for "
+        "CPU-only benchmarks (nvbench::exec_tag::cpu_only / no_gpu).");
+    }
+
+    m_has_frequency = false; // consume it; the next sample must provide its own
+    m_num_samples++;
+  }
+
+  // Check if the stopping criterion is met in the `do_is_finished()` method:
+  virtual bool do_is_finished() override
+  {
+    return m_num_samples >= m_params.get_int64("max-samples");
+  }
+};
+
+// Register the criterion with NVBench:
+NVBENCH_REGISTER_CRITERION(frequency_criterion);
+
+void throughput_bench(nvbench::state &state)
+{
+  // Allocate input data:
+  const std::size_t num_values = 64 * 1024 * 1024 / sizeof(nvbench::int32_t);
+  thrust::device_vector<nvbench::int32_t> input(num_values);
+  thrust::device_vector<nvbench::int32_t> output(num_values);
+
+  // Provide throughput information:
+  state.add_element_count(num_values, "NumElements");
+  state.add_global_memory_reads<nvbench::int32_t>(num_values, "DataSize");
+  state.add_global_memory_writes<nvbench::int32_t>(num_values);
+
+  state.exec([&input, &output, num_values](nvbench::launch &launch) {
+    (void)num_values; // clang thinks this is unused...
+    nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
+      thrust::raw_pointer_cast(input.data()),
+      thrust::raw_pointer_cast(output.data()),
+      num_values);
+  });
+}
+NVBENCH_BENCH(throughput_bench).set_stopping_criterion("frequency");
diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt
index 7dc6cef9..df457dc0 100644
--- a/testing/CMakeLists.txt
+++ b/testing/CMakeLists.txt
@@ -6,6 +6,7 @@ set(test_srcs
   cuda_timer.cu
   cuda_stream.cu
   cpu_timer.cu
+  criterion_exception.cu
   criterion_manager.cu
   criterion_params.cu
   custom_main_custom_args.cu
diff --git a/testing/criterion_exception.cu b/testing/criterion_exception.cu
new file mode 100644
index 00000000..aac365ab
--- /dev/null
+++ b/testing/criterion_exception.cu
@@ -0,0 +1,181 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <nvbench/benchmark.cuh>
+#include <nvbench/callable.cuh>
+#include <nvbench/criterion_manager.cuh>
+#include <nvbench/cuda_call.cuh>
+#include <nvbench/exec_tag.cuh>
+#include <nvbench/launch.cuh>
+#include <nvbench/state.cuh>
+#include <nvbench/stopping_criterion.cuh>
+#include <nvbench/type_list.cuh>
+#include <nvbench/types.cuh>
+
+#include <cuda_runtime.h>
+
+#include <fmt/format.h>
+
+#include <stdexcept>
+#include <string>
+
+#include "test_asserts.cuh"
+
+// Verifies that an exception thrown by a stopping criterion aborts the
+// benchmark (the state is marked as failed) instead of being silently swallowed.
+
+namespace
+{
+
+__global__ void spin_kernel(nvbench::uint64_t target_cycles)
+{
+  const auto start = static_cast<nvbench::uint64_t>(clock64());
+  while (static_cast<nvbench::uint64_t>(clock64()) - start < target_cycles)
+  {
+  }
+}
+
+constexpr nvbench::uint64_t spin_cycles = 100000;
+
+// Where the criterion should throw from:
+enum class throw_site
+{
+  frequency,
+  measurement,
+};
+
+// Shared probe so the test can observe how many times the criterion was
+// consulted. If the exception were swallowed and sampling continued, these
+// counts would climb well past the single call that throws.
+struct criterion_probe
+{
+  throw_site site{throw_site::measurement};
+  int frequency_calls{0};
+  int measurement_calls{0};
+};
+
+criterion_probe g_probe;
+
+// A stopping criterion that throws on demand. The throw happens on the first
+// sample, which should abort the run before `do_is_finished()` is ever
+// consulted. If `do_is_finished()` *is* reached, the exception must have been
+// swallowed, so it returns true to end the run immediately -- this keeps the
+// regression case from spinning until the benchmark timeout.
+class throwing_criterion final : public nvbench::stopping_criterion_base
+{
+public:
+  throwing_criterion()
+      : nvbench::stopping_criterion_base{"test_throwing", {}}
+  {}
+
+protected:
+  void do_initialize() override {}
+
+  void do_add_frequency(nvbench::float32_t /* frequency_hz */) override
+  {
+    ++g_probe.frequency_calls;
+    if (g_probe.site == throw_site::frequency)
+    {
+      throw std::runtime_error{"criterion failure from add_frequency"};
+    }
+  }
+
+  void do_add_measurement(nvbench::float64_t /* measurement */) override
+  {
+    ++g_probe.measurement_calls;
+    if (g_probe.site == throw_site::measurement)
+    {
+      throw std::runtime_error{"criterion failure from add_measurement"};
+    }
+  }
+
+  bool do_is_finished() override
+  {
+    // Only reachable if a sample completed without the throw aborting the run,
+    // i.e. the exception was swallowed. Finish immediately so the test fails
+    // fast on the is_skipped() check instead of sampling until the timeout.
+    return true;
+  }
+};
+NVBENCH_REGISTER_CRITERION(throwing_criterion);
+
+struct spin_generator
+{
+  void operator()(nvbench::state &state, nvbench::type_list<>) const
+  {
+    state.exec(nvbench::exec_tag::impl::cold, [](nvbench::launch &launch) {
+      spin_kernel<<<1, 1, 0, launch.get_stream()>>>(spin_cycles);
+    });
+  }
+};
+
+using benchmark_type = nvbench::benchmark<spin_generator>;
+
+// Runs a benchmark whose criterion throws from `site`, and asserts that the
+// benchmark failed (state skipped with the criterion's error) rather than
+// completing.
+void run_and_expect_failure(throw_site site)
+{
+  g_probe      = criterion_probe{};
+  g_probe.site = site;
+
+  benchmark_type bench{spin_generator{}};
+  bench.add_device(0);
+  bench.set_stopping_criterion("test_throwing");
+
+  // Disable throttle detection. Otherwise the unreliable clock reading of this
+  // tiny kernel can look like throttling, causing record_measurements() to
+  // discard the sample before the criterion is ever consulted -- the throw
+  // would never fire and the run would simply time out.
+  bench.set_throttle_threshold(0.f);
+
+  bench.run();
+
+  NVBENCH_CUDA_CALL(cudaDeviceSynchronize());
+
+  const auto &states = bench.get_states();
+  ASSERT(!states.empty());
+  for (const auto &state : states)
+  {
+    ASSERT(state.is_skipped());
+    ASSERT(state.get_skip_reason().find("criterion failure") != std::string::npos);
+  }
+}
+
+// A throw from `add_measurement` must stop the run after the first sample.
+void test_add_measurement_exception_stops_benchmark()
+{
+  run_and_expect_failure(throw_site::measurement);
+
+  // The skip check above is what proves the run aborted; this confirms the
+  // throw happened on the very first measurement,
+  // and that a frequency measurement was collected as well.
+  ASSERT(g_probe.frequency_calls == 1);
+  ASSERT(g_probe.measurement_calls == 1);
+}
+
+// A throw from `add_frequency` must stop the run before the measurement for that
+// sample is ever recorded.
+void test_add_frequency_exception_stops_benchmark()
+{
+  run_and_expect_failure(throw_site::frequency);
+
+  ASSERT(g_probe.frequency_calls == 1);
+  ASSERT(g_probe.measurement_calls == 0);
+}
+
+} // namespace
+
+int main()
+try
+{
+  test_add_measurement_exception_stops_benchmark();
+  test_add_frequency_exception_stops_benchmark();
+
+  return 0;
+}
+catch (std::exception &e)
+{
+  fmt::print("{}\n", e.what());
+  return 1;
+}