#include <c10/cuda/CUDADeviceAssertionHost.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/util/Backtrace.h>
#include <c10/util/Exception.h>
#include <c10/util/irange.h>
#include <cuda_runtime.h>

#include <memory>
#include <string>
#ifdef TORCH_USE_CUDA_DSA
#include <chrono>
#include <thread>
#endif

#define C10_CUDA_CHECK_WO_DSA(EXPR)                                 \
  do {                                                              \
    const cudaError_t __err = EXPR;                                 \
    c10::cuda::c10_cuda_check_implementation(                       \
        static_cast<int32_t>(__err),                                \
        __FILE__,                                                   \
        __func__, /* Line number data type not well-defined between \
                     compilers, so we perform an explicit cast */   \
        static_cast<uint32_t>(__LINE__),                            \
        false);                                                     \
  } while (0)
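// "WO_DSA" = "without device-side assertions": same shape as the usual CUDA
// error check, but the trailing `false` argument asks
// c10_cuda_check_implementation to skip device-side assertion handling.
// Gathering that state goes through CUDAKernelLaunchRegistry, and this macro
// is used while the registry itself is being initialized, so the regular
// check would recurse (see the "infinite initialization loop" comments below).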
namespace c10::cuda {

namespace {

#ifdef TORCH_USE_CUDA_DSA
/// Get current device id
/// We need our own implementation of this function to prevent
/// an infinite initialization loop for CUDAKernelLaunchRegistry
int dsa_get_device_id() {
  c10::DeviceIndex device = -1;
  C10_CUDA_CHECK_WO_DSA(c10::cuda::GetDevice(&device));
  return device;
}

/// Get a device's compute capability - note that this dangerously assumes
/// that if one CUDA GPU supports device-side assertions they all do. This is
/// probably fine since the latest CUDA GPU that doesn't support UVM is the
/// K80 released 2014-11-17. Mixing that GPU with a newer one is likely to be
/// rare enough that a defensive per-device check isn't worth the complexity.
/// We need our own implementation of this function to prevent
/// an infinite initialization loop for CUDAKernelLaunchRegistry
int dsa_get_device_compute_capability(const int device_num) {
  int compute_capability = -1;
  C10_CUDA_CHECK_WO_DSA(cudaDeviceGetAttribute(
      &compute_capability, cudaDevAttrComputeCapabilityMajor, device_num));
  return compute_capability;
}
#endif

/// Get the number of CUDA devices
/// We need our own implementation of this function to prevent
/// an infinite initialization loop for CUDAKernelLaunchRegistry
int dsa_get_device_count() {
  int device_count = -1;
  C10_CUDA_CHECK_WO_DSA(c10::cuda::GetDeviceCount(&device_count));
  return device_count;
}

bool dsa_check_if_all_devices_support_managed_memory() {
  // It looks as though this'll work best on CUDA GPUs with Pascal
  // architectures or newer, per
  // https://developer.nvidia.com/blog/unified-memory-cuda-beginners/
#ifdef TORCH_USE_CUDA_DSA
  for (const auto i : c10::irange(dsa_get_device_count())) {
    if (dsa_get_device_compute_capability(i) < 6) {
      return false;
    }
  }
  return true;
#else
  return false;
#endif
}

bool env_flag_set(const char* env_var_name) {
  const char* const env_string = std::getenv(env_var_name);
  return (env_string == nullptr) ? false : std::strcmp(env_string, "0");
}

/// Deleter for UVM/managed memory pointers
void uvm_deleter(DeviceAssertionsData* uvm_assertions_ptr) {
  // Ignore error in destructor
  if (uvm_assertions_ptr) {
    C10_CUDA_IGNORE_ERROR(cudaFree(uvm_assertions_ptr));
  }
}

} // namespace

/// Check that kernels ran correctly by checking the message buffer. BLOCKING.
std::string c10_retrieve_device_side_assertion_info() {
#ifdef TORCH_USE_CUDA_DSA
  const auto& launch_registry = CUDAKernelLaunchRegistry::get_singleton_ref();
  if (!launch_registry.enabled_at_runtime) {
    return "Device-side assertion tracking was not enabled by user.";
  } else if (!launch_registry.do_all_devices_support_managed_memory) {
    return "Device-side assertions disabled because not all devices support managed memory.";
  }

  // Hack that saves a lot of challenging sync logic.
  // The GPU increments the number of errors it's observed and the CPU can see
  // that happening immediately, which means we can get here before the GPU is
  // done writing information about those errors to memory.
  // A short pause gives it time to finish. Since something's gone wrong, this
  // pause shouldn't affect perf.
  std::this_thread::sleep_for(std::chrono::seconds(1));

  // The snapshot causes a brief block. That's okay because this function only
  // executes if something's gone wrong such that speed is no longer a
  // priority.
  const auto launch_data = launch_registry.snapshot();
  const auto& assertion_data = launch_data.first;
  const auto& launch_infos = launch_data.second;

  std::stringstream oss;

  oss << "Looking for device-side assertion failure information...\n";

  // Loop over each device that could be managed by the process
  for (const auto device_num : c10::irange(assertion_data.size())) {
    const auto& assertion_data_for_device = assertion_data.at(device_num);

    // Did anything fail?
    const auto failures_found = std::min(
        assertion_data_for_device.assertion_count,
        C10_CUDA_DSA_ASSERTION_COUNT);
    if (failures_found == 0) {
      continue;
    }

    // Something failed, let's talk about that
    oss << failures_found
        << " CUDA device-side assertion failures were found on GPU #"
        << device_num << "!" << std::endl;
    if (assertion_data_for_device.assertion_count >
        C10_CUDA_DSA_ASSERTION_COUNT) {
      oss << "But at least " << assertion_data_for_device.assertion_count
          << " assertion failures occurred on the device" << std::endl;
      oss << "Adjust `C10_CUDA_DSA_ASSERTION_COUNT` if you need more assertion failure info"
          << std::endl;
    }

    for (const auto i : c10::irange(failures_found)) {
      const auto& self = assertion_data_for_device.assertions[i];
      const auto& launch_info = launch_infos[self.caller % launch_infos.size()];
      oss << "Assertion failure " << i << std::endl;
      oss << "  GPU assertion failure message = " << self.assertion_msg
          << std::endl;
      oss << "  File containing assertion = " << self.filename << ":"
          << self.line_number << std::endl;
      oss << "  Device function containing assertion = " << self.function_name
          << std::endl;
      oss << "  Thread ID that failed assertion = [" << self.thread_id[0]
          << "," << self.thread_id[1] << "," << self.thread_id[2] << "]"
          << std::endl;
      oss << "  Block ID that failed assertion = [" << self.block_id[0] << ","
          << self.block_id[1] << "," << self.block_id[2] << "]" << std::endl;
      if (launch_info.generation_number == self.caller) {
        oss << "  File containing kernel launch = "
            << launch_info.launch_filename << ":"
            << launch_info.launch_linenum << std::endl;
        oss << "  Function containing kernel launch = "
            << launch_info.launch_function << std::endl;
        oss << "  Name of kernel launched that led to failure = "
            << launch_info.kernel_name << std::endl;
        oss << "  Device that launched kernel = " << launch_info.device
            << std::endl;
        oss << "  Stream kernel was launched on = " << launch_info.stream
            << std::endl;
        oss << "  Backtrace of kernel launch site = ";
        if (launch_registry.gather_launch_stacktrace) {
          oss << "\n" << launch_info.launch_stacktrace << std::endl;
        } else {
          oss << "Launch stacktracing disabled." << std::endl;
        }
      } else {
        oss << "  CPU launch site info: Unavailable, the circular queue wrapped around. Increase `CUDAKernelLaunchRegistry::max_kernel_launches`."
            << std::endl;
      }
    }
  }
  return oss.str();
#else
  return "Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n";
#endif
}
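// Purely as an illustration of the report format assembled above (every value
// below is a placeholder, not real output):
//
//   Looking for device-side assertion failure information...
//   1 CUDA device-side assertion failures were found on GPU #0!
//   Assertion failure 0
//     GPU assertion failure message = <message from the failing assert>
//     File containing assertion = <file>:<line>
//     Device function containing assertion = <device function>
//     Thread ID that failed assertion = [x,y,z]
//     Block ID that failed assertion = [x,y,z]
//     File containing kernel launch = <file>:<line>
//     ...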
CUDAKernelLaunchRegistry::CUDAKernelLaunchRegistry()
    : do_all_devices_support_managed_memory(
          dsa_check_if_all_devices_support_managed_memory()),
      gather_launch_stacktrace(check_env_for_enable_launch_stacktracing()),
      enabled_at_runtime(check_env_for_dsa_enabled()) {
  for (C10_UNUSED const auto _ : c10::irange(dsa_get_device_count())) {
    uvm_assertions.emplace_back(nullptr, uvm_deleter);
  }

  kernel_launches.resize(max_kernel_launches);
}

bool CUDAKernelLaunchRegistry::check_env_for_enable_launch_stacktracing()
    const {
  return env_flag_set("PYTORCH_CUDA_DSA_STACKTRACING");
}

bool CUDAKernelLaunchRegistry::check_env_for_dsa_enabled() const {
  return env_flag_set("PYTORCH_USE_CUDA_DSA");
}

uint32_t CUDAKernelLaunchRegistry::insert(
    const char* launch_filename,
    const char* launch_function,
    const uint32_t launch_linenum,
    const char* kernel_name,
    const int32_t stream_id) {
#ifdef TORCH_USE_CUDA_DSA
  if (!enabled_at_runtime) {
    return 0;
  }

  const auto backtrace = gather_launch_stacktrace ? c10::get_backtrace() : "";

  const std::lock_guard<std::mutex> lock(read_write_mutex);
  const auto my_gen_number = generation_number++;
  // TODO: It would probably be good to get a stack trace here so that
  // we can better indicate which launch caused the failure.
  kernel_launches[my_gen_number % max_kernel_launches] = {
      launch_filename,
      launch_function,
      launch_linenum,
      backtrace,
      kernel_name,
      dsa_get_device_id(),
      stream_id,
      my_gen_number};
  return my_gen_number;
#else
  return 0;
#endif
}
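// Worked example of the ring buffer above (illustrative capacity only; the
// real capacity is `max_kernel_launches`): with a capacity of 8, generation
// numbers 0..9 land in slots 0..7, then 0 and 1 again. If a device-side
// assertion later reports caller == 1, c10_retrieve_device_side_assertion_info
// looks at kernel_launches[1 % 8]; because launch 9 has since overwritten that
// slot, its generation_number (9) no longer matches the caller (1), and the
// launch-site info is reported as unavailable rather than misattributed.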
std::pair<std::vector<DeviceAssertionsData>, std::vector<CUDAKernelLaunchInfo>>
CUDAKernelLaunchRegistry::snapshot() const {
  // This is likely to be the longest-lasting hold on the mutex, but
  // we only expect it to be called in cases where we're already failing
  // and speed is no longer important
  const std::lock_guard<std::mutex> lock(read_write_mutex);

  std::vector<DeviceAssertionsData> device_assertions_data;
  for (const auto& x : uvm_assertions) {
    if (x) {
      device_assertions_data.push_back(*x);
    } else {
      device_assertions_data.emplace_back();
    }
  }

  return std::make_pair(device_assertions_data, kernel_launches);
}

DeviceAssertionsData* CUDAKernelLaunchRegistry::
    get_uvm_assertions_ptr_for_current_device() {
#ifdef TORCH_USE_CUDA_DSA
  if (!enabled_at_runtime) {
    return nullptr;
  }

  const auto device_num = dsa_get_device_id();

  // If we've already set up this GPU with managed memory, return a pointer to
  // the managed memory. This is a lock-free quick-return path.
  if (uvm_assertions.at(device_num)) {
    return uvm_assertions.at(device_num).get();
  }

  // Need a lock here so there's no race condition when creating the new
  // device assertions buffer
  const std::lock_guard<std::mutex> lock(gpu_alloc_mutex);

  // If we've already set up this GPU with managed memory, return a pointer to
  // the managed memory. This locked path ensures that the device memory is
  // allocated only once.
  if (uvm_assertions.at(device_num)) {
    return uvm_assertions.at(device_num).get();
  }

  // Otherwise, set up the GPU to be able to use the device-side assertion
  // system
  DeviceAssertionsData* uvm_assertions_ptr = nullptr;

  C10_CUDA_CHECK_WO_DSA(
      cudaMallocManaged(&uvm_assertions_ptr, sizeof(DeviceAssertionsData)));

  C10_CUDA_CHECK_WO_DSA(cudaMemAdvise(
      uvm_assertions_ptr,
      sizeof(DeviceAssertionsData),
      cudaMemAdviseSetPreferredLocation,
      cudaCpuDeviceId));

  // GPU will establish direct mapping of data in CPU memory, no page faults
  // will be generated
  C10_CUDA_CHECK_WO_DSA(cudaMemAdvise(
      uvm_assertions_ptr,
      sizeof(DeviceAssertionsData),
      cudaMemAdviseSetAccessedBy,
      cudaCpuDeviceId));

  // Initialize the memory from the CPU; otherwise, pages may have to be
  // created on demand. We think that UVM documentation indicates that first
  // access may not honor preferred location, which would be bad, if true,
  // because we want this memory on the host so we can access it
  // post-assertion. Initializing this on the CPU helps ensure that that's
  // where the memory will live.
  *uvm_assertions_ptr = DeviceAssertionsData();

  // Ownership and lifetime management of `uvm_assertions_ptr` now passes to
  // the `uvm_assertions` vector of unique_ptrs.
  uvm_assertions.at(device_num).reset(uvm_assertions_ptr);

  return uvm_assertions_ptr;
#else
  return nullptr;
#endif
}

CUDAKernelLaunchRegistry& CUDAKernelLaunchRegistry::get_singleton_ref() {
  static CUDAKernelLaunchRegistry launch_registry;
  return launch_registry;
}

bool CUDAKernelLaunchRegistry::has_failed() const {
  for (const auto& x : uvm_assertions) {
    if (x && x->assertion_count > 0) {
      return true;
    }
  }
  return false;
}

} // namespace c10::cuda
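// ---------------------------------------------------------------------------
// Illustrative host-side flow (a sketch only, using the interfaces defined in
// this file; `stream_id` and "my_kernel" are placeholders). Assumes a build
// with TORCH_USE_CUDA_DSA and PYTORCH_USE_CUDA_DSA=1 in the environment:
//
//   auto& registry = c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref();
//   // Record the launch site; a failing device-side assertion reports this
//   // generation number back as its `caller`.
//   const uint32_t gen = registry.insert(
//       __FILE__,
//       __func__,
//       static_cast<uint32_t>(__LINE__),
//       "my_kernel",
//       stream_id);
//   // ... launch the kernel, giving it
//   // registry.get_uvm_assertions_ptr_for_current_device() and `gen` ...
//   if (registry.has_failed()) {
//     std::cerr << c10::cuda::c10_retrieve_device_side_assertion_info();
//   }
// ---------------------------------------------------------------------------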