diff --git a/doc/modules/ROOT/pages/cuda.adoc b/doc/modules/ROOT/pages/cuda.adoc index 2d7a0a3..2da5f89 100644 --- a/doc/modules/ROOT/pages/cuda.adoc +++ b/doc/modules/ROOT/pages/cuda.adoc @@ -290,3 +290,8 @@ Be aware that other threads in the kernel may continue executing with incorrect } ---- +== Examples + +* xref:examples.adoc#examples_cuda[CUDA Device Support] — demonstrates that all safe_numbers types and free functions work on a CUDA device. +* xref:examples.adoc#examples_cuda_error_handling[CUDA Error Handling] — shows how to use `device_error_context` to catch device-side overflow on the host and recover gracefully. +* xref:examples.adoc#examples_cuda_error_handling_without_error_context[CUDA Error Handling Without Error Context] — demonstrates what happens when an overflow occurs on the device *without* `device_error_context`: the CUDA context is irrecoverably corrupted and no further kernels can be launched. diff --git a/doc/modules/ROOT/pages/examples.adoc b/doc/modules/ROOT/pages/examples.adoc index fb31fb3..8122f13 100644 --- a/doc/modules/ROOT/pages/examples.adoc +++ b/doc/modules/ROOT/pages/examples.adoc @@ -608,3 +608,29 @@ result[2] = 31 result[3] = 41 ---- ==== + +[#examples_cuda_error_handling_without_error_context] +== CUDA Error Handling Without Error Context + +This example demonstrates what happens when a safe_numbers overflow occurs on a CUDA device *without* using `device_error_context`. +The overflow triggers a device-side trap that corrupts the CUDA context, making `cudaDeviceSynchronize()` report an unspecified launch failure. +Even after calling `cudaDeviceReset()`, the process cannot launch any further kernels — all subsequent CUDA operations fail. +This motivates the use of `device_error_context` shown in the previous example. 
+
+.This https://github.com/boostorg/safe_numbers/blob/develop/examples/cuda_error_handling_without_error_context.cu[example] demonstrates the consequences of not using `device_error_context` for GPU error handling.
+====
+[source, c++]
+----
+include::example$cuda_error_handling_without_error_context.cu[]
+----
+
+Output:
+----
+=== Launching kernel that overflows ===
+Kernel failed with error: unspecified launch failure
+Kernel has been reset via CUDA API
+
+=== Launching kernel with valid arithmetic ===
+Kernel failed with error: CUDA-capable device(s) is/are busy or unavailable
+----
+==== 
diff --git a/examples/cuda_error_handling_without_error_context.cu b/examples/cuda_error_handling_without_error_context.cu
new file mode 100644
index 0000000..eeb704d
--- /dev/null
+++ b/examples/cuda_error_handling_without_error_context.cu
@@ -0,0 +1,137 @@
+// Copyright 2026 Matt Borland
+// Distributed under the Boost Software License, Version 1.0.
+// https://www.boost.org/LICENSE_1_0.txt
+
+// This example demonstrates what happens when an arithmetic error
+// occurs on a CUDA device WITHOUT device_error_context. When a
+// safe_numbers operation overflows on the GPU, the default behavior
+// is a device-side trap, which cudaDeviceSynchronize() reports as an
+// unspecified launch failure.
+//
+// The trap irrecoverably corrupts the CUDA context: even after
+// cudaDeviceReset(), this process cannot launch any further kernels.
+// Compare with cuda_error_handling.cu, where device_error_context
+// captures the error in managed memory and leaves the context usable. 
+
+// NOTE(review): the original '#include' targets were stripped in
+// transit (angle-bracket contents lost); this list is reconstructed
+// from usage below — confirm against the repository before merging.
+#include <boost/safe_numbers.hpp>
+#include <cuda_runtime.h>
+
+#include <iostream>
+#include <limits>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+// This kernel deliberately overflows: it adds 1 to the maximum u32 value.
+// With no device_error_context installed, the overflow traps the device.
+__global__ void overflow_kernel(test_type* out)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i == 0)
+    {
+        const test_type max_val {(std::numeric_limits<basis_type>::max)()};
+        out[0] = max_val + test_type{1}; // Overflow!
+    }
+}
+
+// This kernel performs valid arithmetic (no overflow possible here,
+// so it would succeed on a healthy CUDA context)
+__global__ void safe_kernel(const test_type* in, test_type* out, int n)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < n)
+    {
+        out[i] = in[i] + test_type{1};
+    }
+}
+
+int main()
+{
+    // ---------------------------------------------------------------
+    // Step 1: Launch a kernel that overflows
+    //
+    // Instead of using boost::safe_numbers::device_error_context
+    // we will use standard CUDA machinery in case the user forgets
+    // or does not want to use it.
+    // ---------------------------------------------------------------
+
+    test_type* result = nullptr;
+    cudaMallocManaged(&result, sizeof(test_type));
+    cudaDeviceSynchronize();
+
+    std::cout << "=== Launching kernel that overflows ===" << std::endl;
+
+    overflow_kernel<<<1, 1>>>(result);
+    auto status = cudaDeviceSynchronize();
+
+    // The default should have trapped the kernel, leading to failure conditions.
+    // Attempt to reset the device — the call itself succeeds, but as
+    // step 2 shows, it does not restore this process's ability to
+    // launch kernels.
+    if (status != cudaSuccess)
+    {
+        const auto error = cudaGetLastError();
+        std::cerr << "Kernel failed with error: " << cudaGetErrorString(error) << std::endl;
+        cudaDeviceReset();
+        std::cerr << "Kernel has been reset via CUDA API" << std::endl;
+    }
+
+    // ---------------------------------------------------------------
+    // Step 2: Try to launch another kernel from the same process
+    //
+    // Since the trap corrupted the CUDA context in the background,
+    // no further work can be submitted from this process.
+    // This error cannot be cleared even though we called cudaDeviceReset()
+    // ---------------------------------------------------------------
+
+    std::cout << "\n=== Launching kernel with valid arithmetic ===" << std::endl;
+
+    test_type* data = nullptr;
+    test_type* out = nullptr;
+
+    cudaMallocManaged(&data, 4 * sizeof(test_type));
+    cudaMallocManaged(&out, 4 * sizeof(test_type));
+    status = cudaDeviceSynchronize();
+
+    if (status != cudaSuccess)
+    {
+        const auto error = cudaGetLastError();
+        std::cerr << "Kernel failed with error: " << cudaGetErrorString(error) << std::endl;
+    }
+    else
+    {
+        // If we had not checked status, the following code would have
+        // segfaulted at data[0] (the managed allocation never happened
+        // on the corrupted context)
+
+        data[0] = test_type{10};
+        data[1] = test_type{20};
+        data[2] = test_type{30};
+        data[3] = test_type{40};
+
+        safe_kernel<<<1, 4>>>(data, out, 4);
+        status = cudaDeviceSynchronize();
+
+        if (status != cudaSuccess)
+        {
+            const auto error = cudaGetLastError();
+            std::cerr << "Kernel failed with error: " << cudaGetErrorString(error) << std::endl;
+        }
+    }
+
+    // ---------------------------------------------------------------
+    // Cleanup (cudaFree is a no-op on nullptr, but the guards make
+    // the intent explicit)
+    // ---------------------------------------------------------------
+
+    if (result != nullptr)
+    {
+        cudaFree(result);
+    }
+    if (data != nullptr)
+    {
+        cudaFree(data);
+    }
+    if (out != nullptr)
+    {
+        cudaFree(out);
+    }
+
+    return 0;
+}
diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index 43241bc..24b7ddd 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -312,3 +312,4 @@ run test_cuda_u128_midpoint.cu ;
 # Examples
 run ../examples/cuda.cu ;
 run ../examples/cuda_error_handling.cu ;
+run ../examples/cuda_error_handling_without_error_context.cu ;