cppalliance · mborland · Apr 7, 2026 · Mar 31, 2026 · Mar 31, 2026 · Mar 31, 2026
diff --git a/doc/modules/ROOT/pages/api_reference.adoc b/doc/modules/ROOT/pages/api_reference.adoc
@@ -61,16 +61,38 @@ https://www.boost.org/LICENSE_1_0.txt
 
 | xref:policies.adoc[`overflow_policy`]
 | Enum class specifying the overflow handling policy for arithmetic operations
+
+| xref:cuda.adoc#cuda_device_exception_mode[`device_exception_mode`]
+| Enum class controlling whether CUDA device errors trap the kernel or defer to the host
 |===
 
-=== CUDA Specific Handling
+=== CUDA Support
 
 [cols="1,2", options="header"]
 |===
 | Type | Description
 
-| xref:cuda.adoc[`device_error_context`]
-| CUDA specific device error context replacing `cudaDeviceSynchronize` and `cudaGetLastError`
+| xref:cuda.adoc#cuda_device_error_context[`device_error_context`]
+| CUDA device error context replacing `cudaDeviceSynchronize` and `cudaGetLastError`
+
+| xref:cuda.adoc#cuda_device_exception_mode[`device_exception_mode`]
+| Enum selecting trapped (immediate `__trap()`) or untrapped (deferred host exception) error handling
+|===
+
+==== `device_error_context` Member Functions
+
+[cols="1,2", options="header"]
+|===
+| Function | Description
+
+| xref:cuda.adoc#cuda_device_error_context_reset[`reset`]
+| Clears the error state so the context can be reused across kernel launches
+
+| xref:cuda.adoc#cuda_device_error_context_set_mode[`set_device_exception_method`]
+| Changes the device exception mode after construction
+
+| xref:cuda.adoc#cuda_device_error_context_synchronize[`synchronize`]
+| Synchronizes the device, checks for captured errors, and throws the corresponding host exception
 |===
 
 [#api_functions]
@@ -337,4 +359,7 @@ This header is not included in the convenience header since it requires external
 
 | `<boost/safe_numbers/byte_conversions.hpp>`
 | Byte order conversion functions (`to_be`, `from_be`, `to_le`, `from_le`, `to_be_bytes`, `from_be_bytes`, `to_le_bytes`, `from_le_bytes`, `to_ne_bytes`, `from_ne_bytes`)
+
+| `<boost/safe_numbers/cuda_error_reporting.hpp>`
+| CUDA device error handling (`device_exception_mode`, `device_error_context`)
 |===
diff --git a/doc/modules/ROOT/pages/cuda.adoc b/doc/modules/ROOT/pages/cuda.adoc
@@ -45,7 +45,7 @@ int main()
 }
 ----
 
-For the on-device computation behavior to match the CPU computation behavior, we have our own error context class
+For the on-device computation behavior to match the CPU computation behavior, we have our own error context class.
 This reduces our above example to the following:
 
 [source, c++]
@@ -92,5 +92,201 @@ Device error on thread 256 at /home/runner/work/safe_numbers/boost-root/libs/saf
 
 The `device_error_context` will also attempt to `printf` the error into the terminal.
 This works when compiling with verbose mode `-V`.
-`printf` error messages will look the same as the message displayed by the thrown exception
+`printf` error messages will look the same as the message displayed by the thrown exception.
+
+[#cuda_device_exception_mode]
+== The `device_exception_mode` Enum
+
+[source,c++]
+----
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
+
+namespace boost::safe_numbers {
+
+enum class device_exception_mode : unsigned
+{
+    trapped,
+    untrapped,
+};
+
+inline constexpr auto trapped = device_exception_mode::trapped;
+inline constexpr auto untrapped = device_exception_mode::untrapped;
+
+} // namespace boost::safe_numbers
+----
+
+This enum controls what happens when a safe_numbers operation detects an error on the CUDA device.
+
+|===
+| Mode | Behavior
+
+| `trapped`
+| Calls `__trap()` on the device, which immediately terminates the kernel.
+This is a *sticky, unrecoverable error*: the CUDA context is corrupted and the entire host process must be terminated in order to reuse the device.
+All other threads in the kernel spin until `__trap()` takes effect.
+This is the default mode because it guarantees a hard failure that cannot be silently ignored.
+
+| `untrapped`
+| Records the error in managed memory and returns without calling `__trap()`.
+The kernel completes normally — other threads may continue executing with potentially incorrect values.
+The error is detected on the host when `synchronize()` is called, which throws the appropriate exception.
+This mode preserves the CUDA context, allowing the `device_error_context` to be reused for subsequent kernel launches after catching the exception.
+|===
+
+Convenience constants `boost::safe_numbers::trapped` and `boost::safe_numbers::untrapped` are provided so the mode can be passed without qualifying the enum:
+
+[source,c++]
+----
+boost::safe_numbers::device_error_context ctx{boost::safe_numbers::untrapped};
+----
+
+[#cuda_device_error_context]
+== The `device_error_context` Class
+
+[source,c++]
+----
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
+
+namespace boost::safe_numbers {
+
+class device_error_context
+{
+public:
+    device_error_context();
+    explicit device_error_context(device_exception_mode e);
+    ~device_error_context();
+
+    device_error_context(const device_error_context&) = delete;
+    device_error_context& operator=(const device_error_context&) = delete;
+
+    void reset();
+    void set_device_exception_method(device_exception_mode e);
+    void synchronize();
+};
+
+} // namespace boost::safe_numbers
+----
+
+The `device_error_context` class manages a CUDA `__managed__` memory buffer used to capture errors from device code.
+When a safe_numbers operation detects an error on the GPU (overflow, underflow, domain error), the error details — file, line, thread ID, expression, and exception type — are written into this shared buffer.
+The host then reads the buffer during `synchronize()` and throws the corresponding `std::exception`.
+
+Only *one* `device_error_context` may exist at a time.
+Constructing a second instance while one is already alive throws `std::logic_error`.
+This constraint prevents races on the shared error buffer.
+
+=== Constructors
+
+[source,c++]
+----
+device_error_context();
+----
+
+Constructs a context with the default `device_exception_mode::trapped` mode.
+Clears any stale error state.
+
+[source,c++]
+----
+explicit device_error_context(device_exception_mode e);
+----
+
+Constructs a context with the specified exception mode.
+Clears any stale error state.
+
+[#cuda_device_error_context_reset]
+=== `reset`
+
+[source,c++]
+----
+void reset();
+----
+
+Clears the error fields (flag, file, line, thread ID, expression) so the context can be reused across kernel launches.
+This is called automatically by the constructors and by `synchronize()` after reading the error state.
+
+[#cuda_device_error_context_set_mode]
+=== `set_device_exception_method`
+
+[source,c++]
+----
+void set_device_exception_method(device_exception_mode e);
+----
+
+Changes the device exception mode after construction.
+This writes to `__managed__` memory, so it takes effect on the next kernel launch.
+
+[#cuda_device_error_context_synchronize]
+=== `synchronize`
+
+[source,c++]
+----
+void synchronize();
+----
+
+Calls `cudaDeviceSynchronize()`, then inspects the managed error buffer.
+If an error was captured by device code, the error state is cleared and the appropriate exception is thrown on the host:
+
+|===
+| Device Error | Host Exception
+
+| Overflow
+| `std::overflow_error`
+
+| Underflow
+| `std::underflow_error`
+
+| Domain error (e.g. division by zero)
+| `std::domain_error`
+
+| Unknown
+| `std::runtime_error`
+|===
+
+The error state is cleared *before* throwing, so after catching the exception the same context is immediately reusable — no manual `reset()` call is needed.
+
+If no device error was captured but `cudaDeviceSynchronize()` returned a non-success status (e.g. from a `__trap()` in trapped mode), a `std::runtime_error` is thrown with the CUDA error string.
+
+== Choosing a Mode
+
+Use `trapped` (the default) when errors must halt execution immediately and silently continuing with wrong results is unacceptable.
+This is the safest option, but the CUDA context cannot be recovered — the process must exit.
+
+Use `untrapped` when you want to detect errors on the host and handle them gracefully (e.g. retry with different inputs, log and continue, or run a fallback path).
+Be aware that other threads in the kernel may continue executing with incorrect values between the point of error and kernel completion.
+
+[source,c++]
+----
+// Trapped mode (default): any device error is immediately fatal
+{
+    boost::safe_numbers::device_error_context ctx;
+    my_kernel<<<blocks, threads>>>(input, output, n);
+
+    try
+    {
+        ctx.synchronize();
+    }
+    catch (const std::runtime_error& e)
+    {
+        // CUDA context is corrupted — log and terminate
+        std::cerr << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
+
+// Untrapped mode: errors are deferred to the host
+{
+    boost::safe_numbers::device_error_context ctx{boost::safe_numbers::untrapped};
+    my_kernel<<<blocks, threads>>>(input, output, n);
+
+    try
+    {
+        ctx.synchronize();
+    }
+    catch (const std::overflow_error& e)
+    {
+        // Context is still valid — can reuse for another launch
+        std::cerr << "Overflow detected: " << e.what() << std::endl;
+    }
+}
+----
 
diff --git a/examples/cuda.cu b/examples/cuda.cu
@@ -11,7 +11,7 @@
 #include <boost/safe_numbers/integer_utilities.hpp>
 #include <boost/safe_numbers/numeric.hpp>
 #include <boost/safe_numbers/charconv.hpp>
-#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
 
 #include <cuda_runtime.h>
 

diff --git a/examples/cuda_error_handling.cu b/examples/cuda_error_handling.cu
@@ -16,7 +16,7 @@
 #include <iostream>
 #include <limits>
 #include <boost/safe_numbers/unsigned_integers.hpp>
-#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/cuda_error_reporting.hpp>
 
 #include <cuda_runtime.h>
 
@@ -51,7 +51,7 @@ int main()
     // Create a single device_error_context for the lifetime of the program.
     // The constructor allocates managed memory for error reporting and
     // clears any stale state.
-    boost::safe_numbers::device_error_context ctx;
+    boost::safe_numbers::device_error_context ctx(boost::safe_numbers::untrapped);
 
     // ---------------------------------------------------------------
     // Step 1: Launch a kernel that overflows and catch the error

diff --git a/...e_numbers/detail/cuda_error_reporting.hpp → ...ost/safe_numbers/cuda_error_reporting.hpp b/...e_numbers/detail/cuda_error_reporting.hpp → ...ost/safe_numbers/cuda_error_reporting.hpp
@@ -29,6 +29,15 @@
 
 namespace boost::safe_numbers {
 
+enum class device_exception_mode : unsigned
+{
+    trapped,
+    untrapped,
+};
+
+inline constexpr auto trapped = device_exception_mode::trapped;
+inline constexpr auto untrapped = device_exception_mode::untrapped;
+
 namespace detail {
 
 enum class exception_type : unsigned
@@ -91,6 +100,10 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE inline void copy_to_buf(char* dst, const char* sr
 // Since we never destroy the CUDA context, __managed__ is safe to use.
 __managed__ cuda_device_error g_device_error {};
 
+// Managed memory enum class that allows us to set what report_device_error should do
+// We default to trapped as that's the best way to ensure hard failure in the event of error
+__managed__ device_exception_mode g_device_fail_type {device_exception_mode::trapped};
+
 // Tracks whether a device_error_context instance is alive.
 // Only one may exist at a time to prevent races on g_device_error.
 inline bool g_device_error_context_active = false;
@@ -112,13 +125,35 @@ __host__ __device__ inline void report_device_error(
         copy_to_buf(g_device_error.file, file, BOOST_SAFE_NUMBERS_DEVICE_ERROR_BUFFER_SIZE);
         copy_to_buf(g_device_error.expression, expression, BOOST_SAFE_NUMBERS_DEVICE_ERROR_BUFFER_SIZE);
         __threadfence_system();
+
+        if (g_device_fail_type == device_exception_mode::trapped)
+        {
+            __trap();
+        }
+    }
+
+    switch (g_device_fail_type)
+    {
+        case device_exception_mode::trapped:
+            // In the event that __trap() is called the error is non-recoverable
+            // The user must terminate the current PROCESS in order to reuse the device
+            // There is currently (3/26) way to recover using the cuda_runtime or hardware APIs
+            // Other threads: spin until the trap terminates the kernel
+            while (true)
+            {
+                __nanosleep(1000000);
+            }
+            break;
+
+        case device_exception_mode::untrapped:
+            // Return instead of calling __trap(). This allows the kernel to
+            // complete normally without corrupting the CUDA context. Other
+            // threads may continue with incorrect values, but synchronize()
+            // will detect the error via the flag and throw on the host.
+            return;
+            break;
     }
 
-    // Return instead of calling __trap(). This allows the kernel to
-    // complete normally without corrupting the CUDA context. Other
-    // threads may continue with incorrect values, but synchronize()
-    // will detect the error via the flag and throw on the host.
-    return;
     #else
 
     const auto msg = std::string(file) + ":" + std::to_string(line) + ": " + expression;
@@ -164,6 +199,21 @@ class device_error_context
         reset();
     }
 
+    // Sets a different error type to our managed global variable
+    device_error_context(const device_exception_mode e)
+    {
+        if (detail::g_device_error_context_active)
+        {
+            BOOST_THROW_EXCEPTION(std::logic_error(
+                "Only one device_error_context may exist at a time"));
+        }
+
+        detail::g_device_fail_type = e;
+
+        detail::g_device_error_context_active = true;
+        reset();
+    }
+
     ~device_error_context()
     {
         detail::g_device_error_context_active = false;
@@ -183,6 +233,12 @@ class device_error_context
         detail::g_device_error.expression[0] = '\0';
     }
 
+    // Adds a post-construction way of setting the failure mode for the device
+    void set_device_exception_method(const device_exception_mode e)
+    {
+        detail::g_device_fail_type = e;
+    }
+
     // Synchronizes the device and checks for errors captured by device code.
     // If an error was detected, the error state is cleared (so the context
     // is immediately reusable), and the appropriate std::exception is thrown.