#include <c10/cuda/CUDADeviceAssertionHost.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/util/Backtrace.h>
#include <c10/util/Exception.h>
#include <c10/util/irange.h>
#include <cuda_runtime.h>

#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <sstream>
#include <string>
#ifdef TORCH_USE_CUDA_DSA
#include <chrono>
#include <thread>
#endif

#define C10_CUDA_CHECK_WO_DSA(EXPR)                                 \
  do {                                                              \
    const cudaError_t __err = EXPR;                                 \
    c10::cuda::c10_cuda_check_implementation(                       \
        static_cast<int32_t>(__err),                                \
        __FILE__,                                                   \
        __func__, /* Line number data type not well-defined between \
                      compilers, so we perform an explicit cast */  \
        static_cast<uint32_t>(__LINE__),                            \
        false);                                                     \
  } while (0)

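// Editor's note: this macro mirrors C10_CUDA_CHECK but reports errors with
// device-side assertions (DSA) disabled (the trailing `false` argument), so
// the registry's own setup code can check CUDA calls without recursing into
// the registry it is still initializing. A minimal usage sketch, with a
// hypothetical call site:
//
//   int count = -1;
//   C10_CUDA_CHECK_WO_DSA(cudaGetDeviceCount(&count));
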
namespace c10::cuda {

namespace {

#ifdef TORCH_USE_CUDA_DSA
/// Get current device id
/// We need our own implementation of this function to prevent
/// an infinite initialization loop for CUDAKernelLaunchRegistry
int dsa_get_device_id() {
  c10::DeviceIndex device = -1;
  C10_CUDA_CHECK_WO_DSA(c10::cuda::GetDevice(&device));
  return device;
}

/// Get a device's compute capability - note that this dangerously assumes
/// that if one CUDA GPU supports device-side assertions they all do. This is
/// probably fine since the latest CUDA GPU that doesn't support UVM is the
/// K80, released 2014-11-17. Mixing that GPU with a newer one is likely to be
/// rare enough that this defensive assumption is acceptable.
/// We need our own implementation of this function to prevent
/// an infinite initialization loop for CUDAKernelLaunchRegistry
int dsa_get_device_compute_capability(const int device_num) {
  int compute_capability = -1;
  C10_CUDA_CHECK_WO_DSA(cudaDeviceGetAttribute(
      &compute_capability, cudaDevAttrComputeCapabilityMajor, device_num));
  return compute_capability;
}
#endif

/// Get the number of CUDA devices
/// We need our own implementation of this function to prevent
/// an infinite initialization loop for CUDAKernelLaunchRegistry
int dsa_get_device_count() {
  int device_count = -1;
  C10_CUDA_CHECK_WO_DSA(c10::cuda::GetDeviceCount(&device_count));
  return device_count;
}

bool dsa_check_if_all_devices_support_managed_memory() {
// It looks as though this'll work best on CUDA GPUs with Pascal
// architectures or newer, per
// https://developer.nvidia.com/blog/unified-memory-cuda-beginners/
#ifdef TORCH_USE_CUDA_DSA
  for (const auto i : c10::irange(dsa_get_device_count())) {
    if (dsa_get_device_compute_capability(i) < 6) {
      return false;
    }
  }
  return true;
#else
  return false;
#endif
}

bool env_flag_set(const char* env_var_name) {
  const char* const env_string = std::getenv(env_var_name);
  return (env_string == nullptr) ? false
                                 : (std::strcmp(env_string, "0") != 0);
}

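// Editor's note: env_flag_set treats an unset variable or the literal "0" as
// false and anything else as true. Under this logic, for example,
// PYTORCH_USE_CUDA_DSA=1 (or any other non-"0" value) enables the flag, while
// PYTORCH_USE_CUDA_DSA=0 or leaving it unset disables it.
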
/// Deleter for UVM/managed memory pointers
void uvm_deleter(DeviceAssertionsData* uvm_assertions_ptr) {
  // Ignore error in destructor
  if (uvm_assertions_ptr) {
    C10_CUDA_IGNORE_ERROR(cudaFree(uvm_assertions_ptr));
  }
}

} // namespace

/// Check that kernels ran correctly by checking the message buffer. BLOCKING.
std::string c10_retrieve_device_side_assertion_info() {
#ifdef TORCH_USE_CUDA_DSA
  const auto& launch_registry = CUDAKernelLaunchRegistry::get_singleton_ref();
  if (!launch_registry.enabled_at_runtime) {
    return "Device-side assertion tracking was not enabled by user.";
  } else if (!launch_registry.do_all_devices_support_managed_memory) {
    return "Device-side assertions disabled because not all devices support managed memory.";
  }

  // Hack that saves a lot of challenging sync logic.
  // The GPU increments the number of errors it has observed, and the CPU can
  // see that happening immediately, which means we can get here before the
  // GPU is done writing information about those errors to memory.
  // A short pause gives it time to finish. Since something has already gone
  // wrong, this pause shouldn't affect perf.
  std::this_thread::sleep_for(std::chrono::seconds(1));

  // The snapshot causes a brief block. That's okay because this function only
  // executes if something's gone wrong such that speed is no longer a
  // priority.
  const auto launch_data = launch_registry.snapshot();
  const auto& assertion_data = launch_data.first;
  const auto& launch_infos = launch_data.second;

  std::stringstream oss;

  oss << "Looking for device-side assertion failure information...\n";

  // Loop over each device that could be managed by the process
  for (const auto device_num : c10::irange(assertion_data.size())) {
    const auto& assertion_data_for_device = assertion_data.at(device_num);

    // Did anything fail?
    const auto failures_found = std::min(
        assertion_data_for_device.assertion_count,
        C10_CUDA_DSA_ASSERTION_COUNT);
    if (failures_found == 0) {
      continue;
    }

    // Something failed, let's talk about that
    oss << failures_found
        << " CUDA device-side assertion failures were found on GPU #"
        << device_num << "!" << std::endl;
    if (assertion_data_for_device.assertion_count >
        C10_CUDA_DSA_ASSERTION_COUNT) {
      oss << "But at least " << assertion_data_for_device.assertion_count
          << " assertion failures occurred on the device" << std::endl;
      oss << "Adjust `C10_CUDA_DSA_ASSERTION_COUNT` if you need more assertion failure info"
          << std::endl;
    }

    for (const auto i : c10::irange(failures_found)) {
      const auto& self = assertion_data_for_device.assertions[i];
      const auto& launch_info = launch_infos[self.caller % launch_infos.size()];
      oss << "Assertion failure " << i << std::endl;
      oss << "  GPU assertion failure message = " << self.assertion_msg
          << std::endl;
      oss << "  File containing assertion = " << self.filename << ":"
          << self.line_number << std::endl;
      oss << "  Device function containing assertion = " << self.function_name
          << std::endl;
      oss << "  Thread ID that failed assertion = [" << self.thread_id[0] << ","
          << self.thread_id[1] << "," << self.thread_id[2] << "]" << std::endl;
      oss << "  Block ID that failed assertion = [" << self.block_id[0] << ","
          << self.block_id[1] << "," << self.block_id[2] << "]" << std::endl;
      if (launch_info.generation_number == self.caller) {
        oss << "  File containing kernel launch = "
            << launch_info.launch_filename << ":" << launch_info.launch_linenum
            << std::endl;
        oss << "  Function containing kernel launch = "
            << launch_info.launch_function << std::endl;
        oss << "  Name of kernel launched that led to failure = "
            << launch_info.kernel_name << std::endl;
        oss << "  Device that launched kernel = " << launch_info.device
            << std::endl;
        oss << "  Stream kernel was launched on = " << launch_info.stream
            << std::endl;
        oss << "  Backtrace of kernel launch site = ";
        if (launch_registry.gather_launch_stacktrace) {
          oss << "\n" << launch_info.launch_stacktrace << std::endl;
        } else {
          oss << "Launch stacktracing disabled." << std::endl;
        }
      } else {
        oss << "  CPU launch site info: Unavailable, the circular queue wrapped around. Increase `CUDAKernelLaunchRegistry::max_kernel_launches`."
            << std::endl;
      }
    }
  }
  return oss.str();
#else
  return "Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n";
#endif
}

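// Editor's note: this function is intended for the host-side error-checking
// path (e.g. c10_cuda_check_implementation) once a CUDA error has already
// been observed. It only formats diagnostics, which is why the one-second
// sleep above and the lock taken by snapshot() are acceptable costs.
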
CUDAKernelLaunchRegistry::CUDAKernelLaunchRegistry()
    : do_all_devices_support_managed_memory(
          dsa_check_if_all_devices_support_managed_memory()),
      gather_launch_stacktrace(check_env_for_enable_launch_stacktracing()),
      enabled_at_runtime(check_env_for_dsa_enabled()) {
  for (C10_UNUSED const auto _ : c10::irange(dsa_get_device_count())) {
    uvm_assertions.emplace_back(nullptr, uvm_deleter);
  }

  kernel_launches.resize(max_kernel_launches);
}

bool CUDAKernelLaunchRegistry::check_env_for_enable_launch_stacktracing()
    const {
  return env_flag_set("PYTORCH_CUDA_DSA_STACKTRACING");
}

bool CUDAKernelLaunchRegistry::check_env_for_dsa_enabled() const {
  return env_flag_set("PYTORCH_USE_CUDA_DSA");
}

uint32_t CUDAKernelLaunchRegistry::insert(
    const char* launch_filename,
    const char* launch_function,
    const uint32_t launch_linenum,
    const char* kernel_name,
    const int32_t stream_id) {
#ifdef TORCH_USE_CUDA_DSA
  if (!enabled_at_runtime) {
    return 0;
  }

  const auto backtrace = gather_launch_stacktrace ? c10::get_backtrace() : "";

  const std::lock_guard<std::mutex> lock(read_write_mutex);

  const auto my_gen_number = generation_number++;
  // TODO: It would probably be good to get a stack trace here so that
  // we can better indicate which launch caused the failure.
  kernel_launches[my_gen_number % max_kernel_launches] = {
      launch_filename,
      launch_function,
      launch_linenum,
      backtrace,
      kernel_name,
      dsa_get_device_id(),
      stream_id,
      my_gen_number};
  return my_gen_number;
#else
  return 0;
#endif
}

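// Editor's note: a launch site is expected to record the generation number
// returned by insert() and pass it to the kernel, so that a failing
// device-side assertion can be matched back to the launch that caused it
// (via the `caller` field checked in c10_retrieve_device_side_assertion_info).
// A sketch with hypothetical names; the real wiring lives in the kernel
// launch macros:
//
//   auto& registry = CUDAKernelLaunchRegistry::get_singleton_ref();
//   const uint32_t gen = registry.insert(
//       __FILE__, __func__, __LINE__, "my_kernel", my_stream_id);
//   // my_kernel<<<grid, block, 0, stream>>>(assertions_data, gen, ...);
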
std::pair<std::vector<DeviceAssertionsData>, std::vector<CUDAKernelLaunchInfo>>
CUDAKernelLaunchRegistry::snapshot() const {
  // This is likely to be the longest-lasting hold on the mutex, but
  // we only expect it to be called in cases where we're already failing
  // and speed is no longer important
  const std::lock_guard<std::mutex> lock(read_write_mutex);

  std::vector<DeviceAssertionsData> device_assertions_data;
  for (const auto& x : uvm_assertions) {
    if (x) {
      device_assertions_data.push_back(*x);
    } else {
      device_assertions_data.emplace_back();
    }
  }

  return std::make_pair(device_assertions_data, kernel_launches);
}

DeviceAssertionsData* CUDAKernelLaunchRegistry::
    get_uvm_assertions_ptr_for_current_device() {
#ifdef TORCH_USE_CUDA_DSA
  if (!enabled_at_runtime) {
    return nullptr;
  }

  const auto device_num = dsa_get_device_id();

  // If we've already set up this GPU with managed memory, return a pointer to
  // the managed memory. This is a lock-free quick-return path.
  if (uvm_assertions.at(device_num)) {
    return uvm_assertions.at(device_num).get();
  }

  // We need a lock here so there's no race condition on creating the new
  // device assertions buffer.
  const std::lock_guard<std::mutex> lock(gpu_alloc_mutex);

  // If we've already set up this GPU with managed memory, return a pointer to
  // the managed memory. This locked path ensures that the device memory is
  // allocated only once.
  if (uvm_assertions.at(device_num)) {
    return uvm_assertions.at(device_num).get();
  }

  // Otherwise, set up the GPU to be able to use the device-side assertion
  // system.
  DeviceAssertionsData* uvm_assertions_ptr = nullptr;

  C10_CUDA_CHECK_WO_DSA(
      cudaMallocManaged(&uvm_assertions_ptr, sizeof(DeviceAssertionsData)));

  C10_CUDA_CHECK_WO_DSA(cudaMemAdvise(
      uvm_assertions_ptr,
      sizeof(DeviceAssertionsData),
      cudaMemAdviseSetPreferredLocation,
      cudaCpuDeviceId));

  // The GPU will establish a direct mapping of the data in CPU memory, so no
  // page faults will be generated.
  C10_CUDA_CHECK_WO_DSA(cudaMemAdvise(
      uvm_assertions_ptr,
      sizeof(DeviceAssertionsData),
      cudaMemAdviseSetAccessedBy,
      cudaCpuDeviceId));

  // Initialize the memory from the CPU; otherwise, pages may have to be
  // created on demand. We think the UVM documentation indicates that first
  // access may not honor the preferred location, which would be bad, if true,
  // because we want this memory on the host so we can access it
  // post-assertion. Initializing it on the CPU helps ensure that's where the
  // memory will live.
  *uvm_assertions_ptr = DeviceAssertionsData();

  // Ownership and lifetime management of `uvm_assertions_ptr` now passes to
  // the uvm_assertions unique_ptr vector.
  uvm_assertions.at(device_num).reset(uvm_assertions_ptr);

  return uvm_assertions_ptr;
#else
  return nullptr;
#endif
}

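// Editor's note: the function above is a double-checked locking pattern; the
// first unlocked read is the fast path, and the re-check under gpu_alloc_mutex
// guarantees the managed buffer is allocated exactly once per device. The
// returned pointer is handed to kernels, which record assertion failures into
// it; because the buffer is UVM/managed memory advised to live on the host,
// the CPU can read those records even after the device is in a failed state.
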
CUDAKernelLaunchRegistry& CUDAKernelLaunchRegistry::get_singleton_ref() {
  static CUDAKernelLaunchRegistry launch_registry;
  return launch_registry;
}

bool CUDAKernelLaunchRegistry::has_failed() const {
  for (const auto& x : uvm_assertions) {
    if (x && x->assertion_count > 0) {
      return true;
    }
  }
  return false;
}

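// Editor's note: has_failed() reads assertion_count without taking
// read_write_mutex. This appears intentional: the count only ever grows while
// the process runs, so a possibly-stale lock-free read is safe as a
// best-effort "did anything assert?" poll.
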
} // namespace c10::cuda