/*
 * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#pragma once
#include <iostream>
#include <stdio.h>
#include <assert.h>
#include <string>
#include "cuda.h"
#include <memory>
#include <cuda/atomic>

namespace CASError {

enum AtomicStatus {
    ATOMIC_NO_ERROR = 0,
    ATOMIC_ERROR_REPORTED = 1
};

inline
cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
    if (result != cudaSuccess) {
        fprintf(stderr, "CUDA Runtime Error: %s\n",
                cudaGetErrorString(result));
        assert(result == cudaSuccess);
    }
#endif
    return result;
}

// Allocates system-pinned host memory holding a single ErrorType
template <typename ErrorType>
struct PinnedMemory
{
    using data_type = ErrorType;

    PinnedMemory() {
        checkCuda(cudaMallocHost(&hdata, sizeof(data_type)));
    }

    ~PinnedMemory() {
        cudaFreeHost(hdata);
    }

    data_type *hdata;
};

// DeviceStatus allocates system-pinned host memory for a StatusType, plus a
// corresponding device-memory copy of the same StatusType.
template <typename StatusType>
struct DeviceStatus
{
    using status_type = StatusType;

    DeviceStatus() {
        // Three-argument form: the templated cudaMallocHost overload, which
        // forwards the flags to cudaHostAlloc.
        checkCuda(cudaMallocHost(&host_status, sizeof(status_type), cudaHostAllocMapped));
        checkCuda(cudaMalloc(&device_status, sizeof(status_type)));
    }

    ~DeviceStatus() {
        checkCuda(cudaFreeHost(host_status));
        checkCuda(cudaFree(device_status));
    }

    __host__ status_type status() {
        return static_cast<volatile cuda::std::atomic<StatusType> *>(host_status)->load(cuda::memory_order_acquire);
    }

    cuda::std::atomic<StatusType> *host_status;
    StatusType *device_status;
};

// The device-visible view: the raw pointers a kernel needs to write an error
// payload and flag it to the host.
template <typename ErrorType>
struct MappedErrorTypeDeviceData {
    using status_type = AtomicStatus;

    // These two members are plain pointers so they can be accessed from the
    // device directly.
    cuda::std::atomic<status_type> *host_status;
    status_type *device_status;

    // Pointer to the pinned error payload, accessed from the device directly
    ErrorType *host_data;

    inline __device__ void synchronizeStatus() {
        host_status->store(ATOMIC_ERROR_REPORTED, cuda::memory_order_release);
    }
};

/* MappedErrorType owns system-pinned memory for an ErrorType payload, together
 * with a corresponding DeviceStatus. Using only the methods below provides the
 * memory fences needed to avoid races between the device and the
 * asynchronously polling host.
 */
template <typename ErrorType>
struct MappedErrorType
{
    // Use the same status type as MappedErrorTypeDeviceData
    using status_type = typename MappedErrorTypeDeviceData<ErrorType>::status_type;

    // System-pinned error payload, written by the user-supplied function
    std::shared_ptr<PinnedMemory<ErrorType>> error_data;

    // Error-reporting indicator coordinating asynchronous soft-error reporting
    std::shared_ptr<DeviceStatus<status_type>> status;

    // The device-side pointers needed for reporting from a kernel
    MappedErrorTypeDeviceData<ErrorType> deviceData;

    MappedErrorType(cudaStream_t stream = 0)
        : error_data(new PinnedMemory<ErrorType>()),
          status(new DeviceStatus<status_type>()),
          deviceData(MappedErrorTypeDeviceData<ErrorType>{.host_status = status->host_status,
                                                          .device_status = status->device_status,
                                                          .host_data = error_data->hdata})
    {
        deviceData.host_status->store(ATOMIC_NO_ERROR, cuda::memory_order_release);
        checkCuda(cudaMemsetAsync(deviceData.device_status, ATOMIC_NO_ERROR, sizeof(status_type), stream));
    }

    /// Checks on the host whether an error has been reported
    __host__ bool checkErrorReported() {
        return (status->status() == ATOMIC_ERROR_REPORTED);
    }

    /** Returns the host-pinned error payload.
     *  Note: if the error data contains device pointers (e.g. const char*), you
     *  will need to post-process those pointers (see getDeviceString below).
     */
    __host__ volatile ErrorType & get() {
        return *static_cast<volatile ErrorType *>(deviceData.host_data);
    }

    /// Clears both the device-side and the host-side status
    __host__ void clear(cudaStream_t stream = 0) {
        checkCuda(cudaMemsetAsync(deviceData.device_status, ATOMIC_NO_ERROR, sizeof(status_type), stream));
        deviceData.host_status->store(ATOMIC_NO_ERROR, cuda::memory_order_release);
    }

    inline __device__ void synchronizeStatus() {
        deviceData.synchronizeStatus();
    }
};

/// Copies a string residing in device memory (e.g. a __FILE__ literal stored
/// there by a kernel) back to the host. The copy is asynchronous, so the
/// caller must synchronize the stream before reading the returned string.
inline std::string getDeviceString(const char *device_string, cudaStream_t stream = 0) {
    // The string length is unknown on the host, so query the extent of the
    // device allocation that backs it.
    CUdeviceptr pbase;
    std::size_t psize;
    cuMemGetAddressRange(&pbase, &psize, reinterpret_cast<CUdeviceptr>(device_string));
    std::string str;
    str.resize(psize);
    cudaMemcpyAsync(str.data(), device_string, psize, cudaMemcpyDeviceToHost, stream);
    return str;
}
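
/* Sketch (the payload field `file` and variable `err` are hypothetical): if
 * the error payload carries a device pointer to a __FILE__ literal, the host
 * can recover it like this, synchronizing before the string is read:
 *
 *   std::string file = getDeviceString(err.get().file, stream);
 *   checkCuda(cudaStreamSynchronize(stream));
 */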

// Reports an error exactly once: the first thread to win the CAS on the
// device-side status runs func to fill the pinned payload, fences, and then
// flips the host-visible status flag.
template <typename ErrorType, typename FunctionType>
inline __device__ void report_first_error(MappedErrorType<ErrorType> &error_dat, FunctionType func) {
    if (atomicCAS(reinterpret_cast<int *>(error_dat.deviceData.device_status),
                  static_cast<int>(ATOMIC_NO_ERROR),
                  static_cast<int>(ATOMIC_ERROR_REPORTED)) == static_cast<int>(ATOMIC_NO_ERROR)) {
        func(*error_dat.deviceData.host_data);
        // System-wide fence so the payload is visible to the host before the
        // release store in synchronizeStatus() flips the flag.
        __threadfence_system();
        error_dat.synchronizeStatus();
    }
}

} // namespace CASError
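
/* Example usage (an illustrative sketch; the payload struct, kernel, and launch
 * configuration below are hypothetical and not part of this header):
 *
 *   struct MyError { int thread_id; int bad_value; };
 *
 *   __global__ void kernel(const int *data, CASError::MappedErrorType<MyError> err) {
 *       int i = blockIdx.x * blockDim.x + threadIdx.x;
 *       if (data[i] < 0) {  // soft error detected: report it without aborting
 *           CASError::report_first_error(err, [&](MyError &e) {
 *               e.thread_id = i;
 *               e.bad_value = data[i];
 *           });
 *       }
 *   }
 *
 *   // Host side: launch, then poll asynchronously while the kernel runs.
 *   CASError::MappedErrorType<MyError> err(stream);
 *   kernel<<<blocks, threads, 0, stream>>>(d_data, err);
 *   while (cudaStreamQuery(stream) == cudaErrorNotReady) {
 *       if (err.checkErrorReported()) {
 *           auto &e = err.get();
 *           std::cerr << "thread " << e.thread_id << " saw " << e.bad_value << "\n";
 *           break;
 *       }
 *   }
 */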