cppalliance · mborland · Apr 7, 2026 · Apr 7, 2026 · Apr 7, 2026 · Apr 7, 2026
diff --git a/doc/modules/ROOT/pages/cuda.adoc b/doc/modules/ROOT/pages/cuda.adoc
@@ -290,3 +290,8 @@ Be aware that other threads in the kernel may continue executing with incorrect
 }
 ----
 
+== Examples
+
+* xref:examples.adoc#examples_cuda[CUDA Device Support] — demonstrates that all safe_numbers types and free functions work on a CUDA device.
+* xref:examples.adoc#examples_cuda_error_handling[CUDA Error Handling] — shows how to use `device_error_context` to catch device-side overflow on the host and recover gracefully.
+* xref:examples.adoc#examples_cuda_error_handling_without_error_context[CUDA Error Handling Without Error Context] — demonstrates what happens when an overflow occurs on the device *without* `device_error_context`: the CUDA context is irrecoverably corrupted and no further kernels can be launched.
diff --git a/doc/modules/ROOT/pages/examples.adoc b/doc/modules/ROOT/pages/examples.adoc
@@ -608,3 +608,29 @@ result[2] = 31
 result[3] = 41
 ----
 ====
+
+[#examples_cuda_error_handling_without_error_context]
+== CUDA Error Handling Without Error Context
+
+This example demonstrates what happens when a safe_numbers overflow occurs on a CUDA device *without* using `device_error_context`.
+The overflow triggers a device-side trap that corrupts the CUDA context, making `cudaDeviceSynchronize()` report an unspecified launch failure.
+Even after calling `cudaDeviceReset()`, the process cannot launch any further kernels — all subsequent CUDA operations fail.
+This motivates the use of `device_error_context` shown in the previous example.
+
+.This https://github.com/boostorg/safe_numbers/blob/develop/examples/cuda_error_handling_without_error_context.cu[example] demonstrates the consequences of not using `device_error_context` for GPU error handling.
+====
+[source, c++]
+----
+include::example$cuda_error_handling_without_error_context.cu[]
+----
+
+Output:
+----
+=== Launching kernel that overflows ===
+Kernel failed with error: unspecified launch failure
+Kernel has been reset via CUDA API
+
+=== Launching kernel with valid arithmetic ===
+Kernel failed with error: CUDA-capable device(s) is/are busy or unavailable
+----
+====
diff --git a/examples/cuda_error_handling_without_error_context.cu b/examples/cuda_error_handling_without_error_context.cu
@@ -0,0 +1,137 @@
+// Copyright 2026 Matt Borland
+// Distributed under the Boost Software License, Version 1.0.
+// https://www.boost.org/LICENSE_1_0.txt
+
+// This example demonstrates how to catch arithmetic errors that occur
+// on a CUDA device using device_error_context. When a safe_numbers
+// operation overflows on the GPU, the error is captured in managed
+// memory and rethrown with BOOST_THROW_EXCEPTION on the host when
+// you call ctx.synchronize().
+//
+// The device_error_context manages a dynamically allocated managed
+// memory buffer. When an error is detected, synchronize() clears the
+// error state and throws. After catching the exception, the same
+// context can be reused immediately for new kernel launches.
+
+#include <iostream>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+// This kernel deliberately overflows: it adds 1 to the maximum u32 value
+__global__ void overflow_kernel(test_type* out)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i == 0)
+    {
+        const test_type max_val {(std::numeric_limits<basis_type>::max)()};
+        out[0] = max_val + test_type{1};  // Overflow!
+    }
+}
+
+// This kernel performs valid arithmetic
+__global__ void safe_kernel(const test_type* in, test_type* out, int n)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < n)
+    {
+        out[i] = in[i] + test_type{1};
+    }
+}
+
+int main()
+{
+    // ---------------------------------------------------------------
+    // Step 1: Launch a kernel that overflows
+    //
+    // Instead of using the booost::safe_numbers::cuda_error_context
+    // we will use standard CUDA machinery in case the user forgets
+    // or does not want to use it.
+    // ---------------------------------------------------------------
+
+    test_type* result = nullptr;
+    cudaMallocManaged(&result, sizeof(test_type));
+    cudaDeviceSynchronize();
+
+    std::cout << "=== Launching kernel that overflows ===" << std::endl;
+
+    overflow_kernel<<<1, 1>>>(result);
+    auto status = cudaDeviceSynchronize();
+
+    // The default should have trapped the kernel, leading to failure conditions
+    // Attempt to reset the device (which we know fails)
+    if (status != cudaSuccess)
+    {
+        const auto error = cudaGetLastError();
+        std::cerr << "Kernel failed with error: " << cudaGetErrorString(error) << std::endl;
+        cudaDeviceReset();
+        std::cerr << "Kernel has been reset via CUDA API" << std::endl;
+    }
+
+    // ---------------------------------------------------------------
+    // Step 2: Try to launch another kernel from the same process
+    //
+    // Since we have trapped the CUDA device in the background,
+    // it is unable to be launched again from the same process.
+    // This error cannot be cleared even though we called cudaDeviceReset()
+    // ---------------------------------------------------------------
+
+    std::cout << "\n=== Launching kernel with valid arithmetic ===" << std::endl;
+
+    test_type* data = nullptr;
+    test_type* out = nullptr;
+
+    cudaMallocManaged(&data, 4 * sizeof(test_type));
+    cudaMallocManaged(&out, 4 * sizeof(test_type));
+    status = cudaDeviceSynchronize();
+
+    if (status != cudaSuccess)
+    {
+        const auto error = cudaGetLastError();
+        std::cerr << "Kernel failed with error: " << cudaGetErrorString(error) << std::endl;
+    }
+    else
+    {
+        // If we had not checked status, this following code would have terminated with SegFault at data[0]
+
+        data[0] = test_type{10};
+        data[1] = test_type{20};
+        data[2] = test_type{30};
+        data[3] = test_type{40};
+
+        safe_kernel<<<1, 4>>>(data, out, 4);
+        status = cudaDeviceSynchronize();
+
+        if (status != cudaSuccess)
+        {
+            const auto error = cudaGetLastError();
+            std::cerr << "Kernel failed with error: " << cudaGetErrorString(error) << std::endl;
+        }
+    }
+
+    // ---------------------------------------------------------------
+    // Cleanup
+    // ---------------------------------------------------------------
+
+    if (result != nullptr)
+    {
+        cudaFree(result);
+    }
+    if (data != nullptr)
+    {
+        cudaFree(data);
+    }
+    if (out != nullptr)
+    {
+        cudaFree(out);
+    }
+
+    return 0;
+}
diff --git a/test/cuda_jamfile b/test/cuda_jamfile
@@ -312,3 +312,4 @@ run test_cuda_u128_midpoint.cu ;
 # Examples
 run ../examples/cuda.cu ;
 run ../examples/cuda_error_handling.cu ;
+run ../examples/cuda_error_handling_without_error_context.cu ;