#pragma once

#include <c10/cuda/CUDAMacros.h>

#include <cstdint>
#include <memory>
#include <mutex>
#include <string>
#include <utility>
#include <vector>

// Device-Side Assertions (DSA) are compiled in whenever CUDA support is built.
#ifdef USE_CUDA
#define TORCH_USE_CUDA_DSA
#endif

/// Number of assertion failure messages we can store. If this is too small
/// threads will fail silently.
constexpr int C10_CUDA_DSA_ASSERTION_COUNT = 10;
/// Fixed capacity of each stored string (message/file/function). Fixed-size
/// buffers are used because device code cannot allocate dynamically.
constexpr int C10_CUDA_DSA_MAX_STR_LEN = 512;

namespace c10::cuda {

/// Holds information about any device-side assertions that fail.
/// Held in managed memory and accessed by both the CPU and the GPU.
/// All members are value-initialized (`{}`) so a freshly allocated record is
/// zeroed; the char buffers are fixed-size so the struct is trivially
/// copyable between host and device.
struct DeviceAssertionData {
  /// Stringification of the assertion
  // NOLINTNEXTLINE(*-c-arrays)
  char assertion_msg[C10_CUDA_DSA_MAX_STR_LEN]{};
  /// File the assertion was in
  // NOLINTNEXTLINE(*-c-arrays)
  char filename[C10_CUDA_DSA_MAX_STR_LEN]{};
  /// Name of the function the assertion was in
  // NOLINTNEXTLINE(*-c-arrays)
  char function_name[C10_CUDA_DSA_MAX_STR_LEN]{};
  /// Line number the assertion was at
  int line_number{};
  /// Number uniquely identifying the kernel launch that triggered the assertion
  uint32_t caller{};
  /// block_id of the thread that failed the assertion
  // NOLINTNEXTLINE(*-c-arrays)
  int32_t block_id[3]{};
  /// third_id of the thread that failed the assertion
  // NOLINTNEXTLINE(*-c-arrays)
  int32_t thread_id[3]{};
};

/// Used to hold assertions generated by the device
/// Held in managed memory and accessed by both the CPU and the GPU.
struct DeviceAssertionsData {
  /// Total number of assertions found; a subset of these will be recorded
  /// in `assertions`
  int32_t assertion_count{};
  /// An array of assertions that will be written to in a race-free manner
  // NOLINTNEXTLINE(*-c-arrays)
  DeviceAssertionData assertions[C10_CUDA_DSA_ASSERTION_COUNT]{};
};

/// Used to hold info about kernel launches so that we can run kernels
/// asynchronously and still associate launches with device-side
/// assertion failures
struct CUDAKernelLaunchInfo {
  /// Filename of the code where the kernel was launched from
  const char* launch_filename;
  /// Function from which the kernel was launched
  const char* launch_function;
  /// Line number of where the code was launched from
  uint32_t launch_linenum;
  /// Backtrace of where the kernel was launched from, only populated if
  /// CUDAKernelLaunchRegistry::gather_launch_stacktrace is True
  std::string launch_stacktrace;
  /// Kernel that was launched
  const char* kernel_name;
  /// Device the kernel was launched on
  int device;
  /// Stream the kernel was launched on
  int32_t stream;
  /// A number that uniquely identifies the kernel launch
  uint64_t generation_number;
};

/// Circular buffer used to hold information about kernel launches
/// this is later used to reconstruct how a device-side kernel assertion failure
/// occurred. CUDAKernelLaunchRegistry is used as a singleton
/// (see `get_singleton_ref`).
class C10_CUDA_API CUDAKernelLaunchRegistry {
 private:
  /// Assume that this is the max number of kernel launches that might ever be
  /// enqueued across all streams on a single device
  static constexpr int max_kernel_launches = 1024;
  /// How many kernel launch infos we've inserted. Used to ensure that circular
  /// queue doesn't provide false information by always increasing, but also to
  /// mark where we are inserting into the queue
  // NOTE(review): only compiled in when DSA is enabled at compile time; host
  // code that touches it is presumably guarded the same way.
#ifdef TORCH_USE_CUDA_DSA
  uint64_t generation_number = 0;
#endif
  /// Shared mutex between writer and accessor to ensure multi-threaded safety.
  /// Marked `mutable` so const accessors (e.g. `snapshot`) can still lock it.
  mutable std::mutex read_write_mutex;
  /// Used to prevent race conditions in GPU memory allocation
  mutable std::mutex gpu_alloc_mutex;
  /// Pointer to managed memory keeping track of device-side assertions. There
  /// is one entry for each possible device the process might work with. Unused
  /// entries are nullptrs. We could also use an unordered_set here, but this
  /// vector design will be faster and the wasted memory is small since we
  /// expect the number of GPUs per node will always be small
  // The custom deleter function pointer lets the unique_ptr release the
  // managed allocation correctly (regular `delete` would be wrong for it).
  std::vector<
      std::unique_ptr<DeviceAssertionsData, void (*)(DeviceAssertionsData*)>>
      uvm_assertions;
  /// A single circular buffer holds information about every kernel launch the
  /// process makes across all devices.
  std::vector<CUDAKernelLaunchInfo> kernel_launches;
  // Reads environment configuration at construction time (definitions live in
  // the corresponding .cpp file).
  bool check_env_for_enable_launch_stacktracing() const;
  bool check_env_for_dsa_enabled() const;

 public:
  CUDAKernelLaunchRegistry();
  /// Register a new kernel launch and obtain a generation number back to be
  /// passed to the kernel
  uint32_t insert(
      const char* launch_filename,
      const char* launch_function,
      const uint32_t launch_linenum,
      const char* kernel_name,
      const int32_t stream_id);
  /// Get copies of the kernel launch registry and each device's assertion
  /// failure buffer so they can be inspected without raising race conditions
  std::
      pair<std::vector<DeviceAssertionsData>, std::vector<CUDAKernelLaunchInfo>>
      snapshot() const;
  /// Get a pointer to the current device's assertion failure buffer. If no such
  /// buffer exists then one is created. This means that the first kernel launch
  /// made on each device will be slightly slower because memory allocations are
  /// required
  DeviceAssertionsData* get_uvm_assertions_ptr_for_current_device();
  /// Gets the global singleton of the registry
  static CUDAKernelLaunchRegistry& get_singleton_ref();
  /// If not all devices support DSA, we disable it
  const bool do_all_devices_support_managed_memory = false;
  /// Whether or not to gather stack traces when launching kernels
  bool gather_launch_stacktrace = false;
  /// Whether or not host-side DSA is enabled or disabled at run-time
  /// Note: Device-side code cannot be enabled/disabled at run-time
  bool enabled_at_runtime = false;
  /// Whether or not a device has indicated a failure
  bool has_failed() const;
  /// Compile-time DSA availability; mirrors the TORCH_USE_CUDA_DSA define so
  /// callers can branch without repeating the preprocessor check.
#ifdef TORCH_USE_CUDA_DSA
  const bool enabled_at_compile_time = true;
#else
  const bool enabled_at_compile_time = false;
#endif
};

/// Builds a human-readable report of any recorded device-side assertion
/// failures (implementation lives in the corresponding .cpp file).
std::string c10_retrieve_device_side_assertion_info();

} // namespace c10::cuda

// Each kernel launched with TORCH_DSA_KERNEL_LAUNCH
// requires the same input arguments. We introduce the following macro to
// standardize these.
#define TORCH_DSA_KERNEL_ARGS                                              \
  [[maybe_unused]] c10::cuda::DeviceAssertionsData *const assertions_data, \
      [[maybe_unused]] uint32_t assertion_caller_id

// This macro can be used to pass the DSA arguments onward to another
// function
#define TORCH_DSA_KERNEL_ARGS_PASS assertions_data, assertion_caller_id