xref: /aosp_15_r20/external/pytorch/torch/csrc/autograd/profiler_kineto.h (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
#pragma once

#include <cstdint>
#include <functional>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include <torch/csrc/profiler/api.h>
#include <torch/csrc/profiler/events.h>
#include <torch/csrc/profiler/stubs/base.h>
#include <torch/csrc/profiler/util.h>
10 
11 namespace torch {
12 
13 namespace profiler::impl {
14 struct Result;
15 namespace kineto {
16 struct ActivityTraceWrapper;
17 } // namespace kineto
18 } // namespace profiler::impl
19 
20 namespace autograd::profiler {
21 using experimental_event_t = std::shared_ptr<torch::profiler::impl::Result>;
22 using extra_meta_t = std::unordered_map<std::string, std::string>;
23 
24 struct TORCH_API KinetoEvent {
25   KinetoEvent(
26       const std::shared_ptr<const torch::profiler::impl::Result>&,
27       const bool verbose);
28 
29   uint64_t startThreadId() const;
30   uint64_t endThreadId() const;
31   uint8_t activityType() const;
32   uint64_t fwdThreadId() const;
33   bool hasShapes() const;
34   const c10::ArrayRef<std::vector<int64_t>> shapes() const;
35   bool hasTypes() const;
36   const c10::ArrayRef<std::string> dtypes() const;
37   bool hasConcreteInputs() const;
38   const c10::ArrayRef<c10::IValue> concreteInputs() const;
39   bool hasKwinputs() const;
40   const std::unordered_map<std::string, c10::IValue> kwinputs() const;
41   uint64_t flops() const;
42   int64_t sequenceNr() const;
43   bool hasStack() const;
44   const c10::ArrayRef<std::string> stack() const;
45   uint8_t scope() const;
46   bool hasModuleHierarchy() const;
47   const c10::ArrayRef<std::string> moduleHierarchy() const;
48   int64_t debugHandle() const;
49   std::string name() const;
50   c10::DeviceType deviceType() const;
51   int deviceIndex() const;
52   int64_t nBytes() const;
53   uint64_t startNs() const;
54   uint64_t endNs() const;
55   uint64_t durationNs() const;
56   bool isAsync() const;
57   uint64_t correlationId() const;
58   uint64_t linkedCorrelationId() const;
59   int64_t deviceResourceId() const;
60   std::string backend() const;
61   bool isPythonFunction() const;
62   int64_t cudaElapsedUs() const;
63   int64_t privateuse1ElapsedUs() const;
64   void getPerfEventCounters(torch::profiler::perf_counters_t&) const;
65   extra_meta_t extraMeta() const;
66 
67  private:
68   torch::profiler::impl::ProfilerVoidEventStub fallbackStart() const;
69   torch::profiler::impl::ProfilerVoidEventStub fallbackEnd() const;
70 
71   std::shared_ptr<const torch::profiler::impl::Result> result_;
72   std::vector<std::string> python_stack_;
73 
74   // Copy fields from result so we can return ArrayRefs.
75   std::vector<std::vector<int64_t>> shapes_;
76   std::vector<std::string> dtypes_;
77   std::vector<c10::IValue> concrete_inputs_;
78   std::unordered_map<std::string, c10::IValue> kwinputs_;
79 };
80 
81 // Consolidating events returned directly from Kineto
82 // with events manually created by us (e.g. start/stop marks,
83 // memory allocation events)
84 struct TORCH_API ProfilerResult {
85   ProfilerResult();
86   ProfilerResult(
87       uint64_t start_time,
88       std::vector<KinetoEvent> events,
89       std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>&&
90           trace,
91       std::vector<experimental_event_t>&& event_tree);
92   ~ProfilerResult();
93 
trace_start_nsProfilerResult94   uint64_t trace_start_ns() const {
95     return trace_start_ns_;
96   }
97 
eventsProfilerResult98   const std::vector<KinetoEvent>& events() const {
99     return events_;
100   }
101 
event_treeProfilerResult102   const std::vector<experimental_event_t>& event_tree() const {
103     return event_tree_;
104   }
105 
106   void save(const std::string& path);
107 
108  private:
109   uint64_t trace_start_ns_ = 0;
110   std::vector<KinetoEvent> events_;
111   std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper> trace_;
112   std::vector<experimental_event_t> event_tree_;
113 };
114 
115 /*
116  * This API is used by backends to record latency of events that
117  * happened in the backend but were not visible to pytorch runtime.
118  * For example, if part of the model is lowered to a dsp backend, then
119  * the execution of that part of the model is delegated to the backend.
120  * When backend finishes execution it has an option to provide profiling
121  * information (latency only at the moment) corresponding to different operators
122  * that were executed in the backend.
123  * When such events are recorded by backend using this API, the event
124  * records will be collected by active kineto profiler. If no kineto profiler
125  * is active then the event is ignored.
126  * This provides us with a way to generate all the profiling information
127  * for a model regardless of where model (or part of it) executed.
128  * @param start_time_us: start time in us of the event
129  * @param end_time_us: end time in us of the event
130  * @param debug_handle: debug handle to correlate this event/op with
131  * model level module/source information
132  * @param scope: scope of the event, e.g. LITE_INTERPRETER, RECORD_FN etc.
133  * @param event_name: name of the event, e.g. op name
134  * @param backend_name: name of the backend where the event took place.
135  */
136 TORCH_API void reportBackendEventToActiveKinetoProfiler(
137     const int64_t start_time_us,
138     const int64_t end_time_us,
139     const int64_t debug_handle,
140     const at::RecordScope scope,
141     const std::string& event_name,
142     const std::string& backend_name);
143 
144 TORCH_API void enableProfiler(
145     const torch::profiler::impl::ProfilerConfig& config,
146     const std::set<torch::profiler::impl::ActivityType>& activities,
147     const std::unordered_set<at::RecordScope>& scopes = {});
148 
149 /*
150  * Same as enableProfiler but with callback to do post-processing of
151  * KinetoEvents.
152  * enableProfilerWithEventPostProcess enables profiler to capture
153  * specified activities, with specified RecordFunction scope, if any.
154  * Additionally, it takes a functor that does in-place post processing of
155  * events, e.g. populate stack trace or module hierarchy information lazily
156  * using debug_handle.
157  * Example usage is with lite interpreter that has recording scope of
158  * LITE_INTERPRETER. In this case lite interpreter runtime, records debug
159  * handles in RecordFunction, along with other information. Debug handles are
160  * eventually passed down to KinetoEvent and recorded as part of the event.
161  * KinetoEdgeCPUProfiler, in torch/csrc/jit/mobile/profiler_edge.cpp, enables
162  * profiler using post-processing callback, via
163  * enableProfilerWithEventPostProcess, that takes these debug handles and
164  * generates stack trace and module hierarchy information, once profiling is
165  * done.
166  */
167 using post_process_t = std::function<void(
168     /*debug_handle */ int64_t,
169     /*jit_stack    */ std::vector<std::string>&,
170     /*jit_modules  */ std::vector<std::string>&)>;
171 TORCH_API void enableProfilerWithEventPostProcess(
172     const torch::profiler::impl::ProfilerConfig& config,
173     const std::set<torch::profiler::impl::ActivityType>& activities,
174     post_process_t&& cb,
175     const std::unordered_set<at::RecordScope>& scopes = {});
176 
177 TORCH_API std::unique_ptr<ProfilerResult> disableProfiler();
178 
179 TORCH_API void prepareProfiler(
180     const torch::profiler::impl::ProfilerConfig& config,
181     const std::set<torch::profiler::impl::ActivityType>& activities);
182 
183 TORCH_API void toggleCollectionDynamic(
184     const bool enable,
185     const std::set<torch::profiler::impl::ActivityType>& activities);
186 
187 /**
188  * When a C++ thread really has no control over how the profiler was enabled,
189  * for example, by some unreachable Python code, it can call these functions
190  * to test/join/unjoin itself into the collection set of a profiler, if any.
191  * Without calling these functions, the symptom may be "not seeing GPU events
192  * from some child C++ threads". This is an example on how to use them,
193  *
194  *    using namespace torch::autograd::profiler;
195  *    bool enabled = isProfilerEnabledInMainThread();
196  *    if (enabled != saved_enabled_state) {
197  *      if (enabled) {
198  *        enableProfilerInChildThread();
199  *      } else {
200  *        disableProfilerInChildThread();
201  *      }
202  *      saved_enabled_state = enabled;
203  *    }
204  */
205 TORCH_API bool isProfilerEnabledInMainThread();
206 TORCH_API void enableProfilerInChildThread();
207 TORCH_API void disableProfilerInChildThread();
208 
209 } // namespace autograd::profiler
210 
211 namespace profiler::impl {
212 
213 // Experimental.
214 TORCH_API void _reportVulkanEventToProfiler(vulkan_id_t id);
215 
216 } // namespace profiler::impl
217 
218 } // namespace torch
219