#pragma once

#include <c10/cuda/CUDAMacros.h>

#include <cstdint>
#include <memory>
#include <mutex>
#include <string>
#include <utility>
#include <vector>

#ifdef USE_CUDA
#define TORCH_USE_CUDA_DSA
#endif

/// Number of assertion failure messages we can store. If this is too small,
/// threads will fail silently.
constexpr int C10_CUDA_DSA_ASSERTION_COUNT = 10;
constexpr int C10_CUDA_DSA_MAX_STR_LEN = 512;

namespace c10::cuda {

/// Holds information about any device-side assertions that fail.
/// Held in managed memory and accessed by both the CPU and the GPU.
struct DeviceAssertionData {
  /// Stringification of the assertion
  // NOLINTNEXTLINE(*-c-arrays)
  char assertion_msg[C10_CUDA_DSA_MAX_STR_LEN]{};
  /// File the assertion was in
  // NOLINTNEXTLINE(*-c-arrays)
  char filename[C10_CUDA_DSA_MAX_STR_LEN]{};
  /// Name of the function the assertion was in
  // NOLINTNEXTLINE(*-c-arrays)
  char function_name[C10_CUDA_DSA_MAX_STR_LEN]{};
  /// Line number the assertion was at
  int line_number{};
  /// Number uniquely identifying the kernel launch that triggered the assertion
  uint32_t caller{};
  /// block_id of the thread that failed the assertion
  // NOLINTNEXTLINE(*-c-arrays)
  int32_t block_id[3]{};
  /// thread_id of the thread that failed the assertion
  // NOLINTNEXTLINE(*-c-arrays)
  int32_t thread_id[3]{};
};

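// A minimal host-side sketch of how one of these records might be printed;
// `print_record` is a hypothetical helper, not part of this header:
//
//   void print_record(const c10::cuda::DeviceAssertionData& a) {
//     std::printf(
//         "%s:%d: %s: assertion `%s` failed at block=(%d,%d,%d) "
//         "thread=(%d,%d,%d)\n",
//         a.filename, a.line_number, a.function_name, a.assertion_msg,
//         a.block_id[0], a.block_id[1], a.block_id[2],
//         a.thread_id[0], a.thread_id[1], a.thread_id[2]);
//   }
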
/// Used to hold assertions generated by the device
/// Held in managed memory and accessed by both the CPU and the GPU.
struct DeviceAssertionsData {
  /// Total number of assertions found; a subset of these will be recorded
  /// in `assertions`
  int32_t assertion_count{};
  /// An array of assertions that will be written to in a race-free manner
  // NOLINTNEXTLINE(*-c-arrays)
  DeviceAssertionData assertions[C10_CUDA_DSA_ASSERTION_COUNT]{};
};

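// "Race-free" above means each failing thread atomically claims its own slot.
// A device-side sketch of the idea (illustrative only; the real writer lives
// elsewhere in c10, and `record_failure` is a hypothetical name):
//
//   __device__ void record_failure(c10::cuda::DeviceAssertionsData* data) {
//     // Atomically reserve the next slot. The count keeps increasing even
//     // once the `assertions` array is full, so overflow can be detected.
//     const int32_t slot = atomicAdd(&data->assertion_count, 1);
//     if (slot >= C10_CUDA_DSA_ASSERTION_COUNT) {
//       return; // Counted but not recorded.
//     }
//     // ... fill in data->assertions[slot] ...
//   }
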
/// Used to hold info about kernel launches so that we can run kernels
/// asynchronously and still associate launches with device-side
/// assertion failures
struct CUDAKernelLaunchInfo {
  /// Filename of the code where the kernel was launched from
  const char* launch_filename;
  /// Function from which the kernel was launched
  const char* launch_function;
  /// Line number of where the code was launched from
  uint32_t launch_linenum;
  /// Backtrace of where the kernel was launched from, only populated if
  /// CUDAKernelLaunchRegistry::gather_launch_stacktrace is true
  std::string launch_stacktrace;
  /// Kernel that was launched
  const char* kernel_name;
  /// Device the kernel was launched on
  int device;
  /// Stream the kernel was launched on
  int32_t stream;
  /// A number that uniquely identifies the kernel launch
  uint64_t generation_number;
};

/// Circular buffer used to hold information about kernel launches;
/// this is later used to reconstruct how a device-side kernel assertion
/// failure occurred. CUDAKernelLaunchRegistry is used as a singleton.
class C10_CUDA_API CUDAKernelLaunchRegistry {
 private:
  /// Assume that this is the max number of kernel launches that might ever be
  /// enqueued across all streams on a single device
  static constexpr int max_kernel_launches = 1024;
  /// How many kernel launch infos we've inserted. This always increases so
  /// that the circular queue doesn't serve stale information, and it also
  /// marks where we are inserting into the queue
#ifdef TORCH_USE_CUDA_DSA
  uint64_t generation_number = 0;
#endif
  /// Shared mutex between writer and accessor to ensure multi-threaded safety.
  mutable std::mutex read_write_mutex;
  /// Used to prevent race conditions in GPU memory allocation
  mutable std::mutex gpu_alloc_mutex;
  /// Pointer to managed memory keeping track of device-side assertions. There
  /// is one entry for each possible device the process might work with. Unused
  /// entries are nullptrs. We could also use an unordered_set here, but this
  /// vector design will be faster and the wasted memory is small since we
  /// expect the number of GPUs per node will always be small
  std::vector<
      std::unique_ptr<DeviceAssertionsData, void (*)(DeviceAssertionsData*)>>
      uvm_assertions;
  /// A single circular buffer holds information about every kernel launch the
  /// process makes across all devices.
  std::vector<CUDAKernelLaunchInfo> kernel_launches;
  bool check_env_for_enable_launch_stacktracing() const;
  bool check_env_for_dsa_enabled() const;

 public:
  CUDAKernelLaunchRegistry();
  /// Register a new kernel launch and obtain a generation number to be
  /// passed to the kernel
  uint32_t insert(
      const char* launch_filename,
      const char* launch_function,
      const uint32_t launch_linenum,
      const char* kernel_name,
      const int32_t stream_id);
  /// Get copies of the kernel launch registry and each device's assertion
  /// failure buffer so they can be inspected without introducing race
  /// conditions
  std::
      pair<std::vector<DeviceAssertionsData>, std::vector<CUDAKernelLaunchInfo>>
      snapshot() const;
  /// Get a pointer to the current device's assertion failure buffer. If no such
  /// buffer exists then one is created. This means that the first kernel launch
  /// made on each device will be slightly slower because memory allocations are
  /// required
  DeviceAssertionsData* get_uvm_assertions_ptr_for_current_device();
  /// Gets the global singleton of the registry
  static CUDAKernelLaunchRegistry& get_singleton_ref();
  /// If not all devices support DSA, we disable it
  const bool do_all_devices_support_managed_memory = false;
  /// Whether or not to gather stack traces when launching kernels
  bool gather_launch_stacktrace = false;
  /// Whether host-side DSA is enabled at run-time
  /// Note: Device-side code cannot be enabled/disabled at run-time
  bool enabled_at_runtime = false;
  /// Whether or not a device has indicated a failure
  bool has_failed() const;
#ifdef TORCH_USE_CUDA_DSA
  const bool enabled_at_compile_time = true;
#else
  const bool enabled_at_compile_time = false;
#endif
};

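// A host-side sketch of how the registry ties kernel launches to device-side
// failures (illustrative only; `stream_id` and "my_kernel" are placeholders,
// and the real call sites are PyTorch's launch/check macros):
//
//   auto& registry = c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref();
//   // Record the launch; the returned id is handed to the kernel, and a
//   // failing thread stores it in DeviceAssertionData::caller.
//   const uint32_t caller_id = registry.insert(
//       __FILE__, __FUNCTION__, __LINE__, "my_kernel", stream_id);
//   // ... launch the kernel, then later ...
//   if (registry.has_failed()) {
//     // snapshot() returns race-free copies; each recorded assertion's
//     // `caller` can be related back to the launch that produced it.
//     auto [assertions, launches] = registry.snapshot();
//   }
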
std::string c10_retrieve_device_side_assertion_info();

} // namespace c10::cuda

// Each kernel launched with TORCH_DSA_KERNEL_LAUNCH
// requires the same input arguments. We introduce the following macro to
// standardize these.
#define TORCH_DSA_KERNEL_ARGS                                              \
  [[maybe_unused]] c10::cuda::DeviceAssertionsData *const assertions_data, \
      [[maybe_unused]] uint32_t assertion_caller_id

// This macro can be used to pass the DSA arguments onward to another
// function
#define TORCH_DSA_KERNEL_ARGS_PASS assertions_data, assertion_caller_id
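
// Usage sketch for the two macros above (hedged; `my_kernel` and `helper`
// are hypothetical names): a kernel declares the standard DSA parameters
// after its own and forwards them verbatim to device helpers.
//
//   __device__ void helper(int x, TORCH_DSA_KERNEL_ARGS) {
//     // assertions_data / assertion_caller_id are available here.
//   }
//
//   __global__ void my_kernel(int x, TORCH_DSA_KERNEL_ARGS) {
//     helper(x, TORCH_DSA_KERNEL_ARGS_PASS);
//   }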