1 #pragma once 2 3 #include <string> 4 #include <vector> 5 6 #include <torch/csrc/profiler/api.h> 7 #include <torch/csrc/profiler/events.h> 8 #include <torch/csrc/profiler/stubs/base.h> 9 #include <torch/csrc/profiler/util.h> 10 11 namespace torch { 12 13 namespace profiler::impl { 14 struct Result; 15 namespace kineto { 16 struct ActivityTraceWrapper; 17 } // namespace kineto 18 } // namespace profiler::impl 19 20 namespace autograd::profiler { 21 using experimental_event_t = std::shared_ptr<torch::profiler::impl::Result>; 22 using extra_meta_t = std::unordered_map<std::string, std::string>; 23 24 struct TORCH_API KinetoEvent { 25 KinetoEvent( 26 const std::shared_ptr<const torch::profiler::impl::Result>&, 27 const bool verbose); 28 29 uint64_t startThreadId() const; 30 uint64_t endThreadId() const; 31 uint8_t activityType() const; 32 uint64_t fwdThreadId() const; 33 bool hasShapes() const; 34 const c10::ArrayRef<std::vector<int64_t>> shapes() const; 35 bool hasTypes() const; 36 const c10::ArrayRef<std::string> dtypes() const; 37 bool hasConcreteInputs() const; 38 const c10::ArrayRef<c10::IValue> concreteInputs() const; 39 bool hasKwinputs() const; 40 const std::unordered_map<std::string, c10::IValue> kwinputs() const; 41 uint64_t flops() const; 42 int64_t sequenceNr() const; 43 bool hasStack() const; 44 const c10::ArrayRef<std::string> stack() const; 45 uint8_t scope() const; 46 bool hasModuleHierarchy() const; 47 const c10::ArrayRef<std::string> moduleHierarchy() const; 48 int64_t debugHandle() const; 49 std::string name() const; 50 c10::DeviceType deviceType() const; 51 int deviceIndex() const; 52 int64_t nBytes() const; 53 uint64_t startNs() const; 54 uint64_t endNs() const; 55 uint64_t durationNs() const; 56 bool isAsync() const; 57 uint64_t correlationId() const; 58 uint64_t linkedCorrelationId() const; 59 int64_t deviceResourceId() const; 60 std::string backend() const; 61 bool isPythonFunction() const; 62 int64_t cudaElapsedUs() const; 63 int64_t privateuse1ElapsedUs() const; 64 void getPerfEventCounters(torch::profiler::perf_counters_t&) const; 65 extra_meta_t extraMeta() const; 66 67 private: 68 torch::profiler::impl::ProfilerVoidEventStub fallbackStart() const; 69 torch::profiler::impl::ProfilerVoidEventStub fallbackEnd() const; 70 71 std::shared_ptr<const torch::profiler::impl::Result> result_; 72 std::vector<std::string> python_stack_; 73 74 // Copy fields from result so we can return ArrayRefs. 75 std::vector<std::vector<int64_t>> shapes_; 76 std::vector<std::string> dtypes_; 77 std::vector<c10::IValue> concrete_inputs_; 78 std::unordered_map<std::string, c10::IValue> kwinputs_; 79 }; 80 81 // Consolidating events returned directly from Kineto 82 // with events manually created by us (e.g. start/stop marks, 83 // memory allocation events) 84 struct TORCH_API ProfilerResult { 85 ProfilerResult(); 86 ProfilerResult( 87 uint64_t start_time, 88 std::vector<KinetoEvent> events, 89 std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>&& 90 trace, 91 std::vector<experimental_event_t>&& event_tree); 92 ~ProfilerResult(); 93 trace_start_nsProfilerResult94 uint64_t trace_start_ns() const { 95 return trace_start_ns_; 96 } 97 eventsProfilerResult98 const std::vector<KinetoEvent>& events() const { 99 return events_; 100 } 101 event_treeProfilerResult102 const std::vector<experimental_event_t>& event_tree() const { 103 return event_tree_; 104 } 105 106 void save(const std::string& path); 107 108 private: 109 uint64_t trace_start_ns_ = 0; 110 std::vector<KinetoEvent> events_; 111 std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper> trace_; 112 std::vector<experimental_event_t> event_tree_; 113 }; 114 115 /* 116 * This API is used by backends to record latency of events that 117 * happened in the backend but were not visible to pytorch runtime. 118 * For example, if part of the model is lowered to a dsp backend, then 119 * the execution of that part of the model is delegated to the backend. 120 * When backend finishes execution it has an option to provide profiling 121 * information (latency only at the moment) corresponding to different operators 122 * that were executed in the backend. 123 * When such events are recorded by backend using this API, the event 124 * records will be collected by active kineto profiler. If no kineto profiler 125 * is active then the event is ignored. 126 * This provides us with a way to generate all the profiling information 127 * for a model regardless of where model (or part of it) executed. 128 * @param start_time_us: start time in us of the event 129 * @param end_time_us: end time in us of the event 130 * @param debug_handle: debug handle to correlate this event/op with 131 * model level module/source information 132 * @param scope: scope of the event, e.g. LITE_INTERPRETER, RECORD_FN etc. 133 * @param event_name: name of the event, e.g. op name 134 * @param backend_name: name of the backend where the event took place. 135 */ 136 TORCH_API void reportBackendEventToActiveKinetoProfiler( 137 const int64_t start_time_us, 138 const int64_t end_time_us, 139 const int64_t debug_handle, 140 const at::RecordScope scope, 141 const std::string& event_name, 142 const std::string& backend_name); 143 144 TORCH_API void enableProfiler( 145 const torch::profiler::impl::ProfilerConfig& config, 146 const std::set<torch::profiler::impl::ActivityType>& activities, 147 const std::unordered_set<at::RecordScope>& scopes = {}); 148 149 /* 150 * Same as enableProfiler but with callback to do post-processing of 151 * KinetoEvents. 152 * enableProfilerWithEventPostProcess enables profiler to capture 153 * specified activities, with specified RecordFunction scope, if any. 154 * Additionally, it takes a functor that does in-place post processing of 155 * events, e.g. populate stack trace or module hierarchy information lazily 156 * using debug_handle. 157 * Example usage is with lite interpreter that has recording scope of 158 * LITE_INTERPRETER. In this case lite interpreter runtime, records debug 159 * handles in RecordFunction, along with other information. Debug handles are 160 * eventually passed down to KinetoEvent and recorded as part of the event. 161 * KinetoEdgeCPUProfiler, in torch/csrc/jit/mobile/profiler_edge.cpp, enables 162 * profiler using post-processing callback, via 163 * enableProfilerWithEventPostProcess, that takes these debug handles and 164 * generates stack trace and module hierarchy information, once profiling is 165 * done. 166 */ 167 using post_process_t = std::function<void( 168 /*debug_handle */ int64_t, 169 /*jit_stack */ std::vector<std::string>&, 170 /*jit_modules */ std::vector<std::string>&)>; 171 TORCH_API void enableProfilerWithEventPostProcess( 172 const torch::profiler::impl::ProfilerConfig& config, 173 const std::set<torch::profiler::impl::ActivityType>& activities, 174 post_process_t&& cb, 175 const std::unordered_set<at::RecordScope>& scopes = {}); 176 177 TORCH_API std::unique_ptr<ProfilerResult> disableProfiler(); 178 179 TORCH_API void prepareProfiler( 180 const torch::profiler::impl::ProfilerConfig& config, 181 const std::set<torch::profiler::impl::ActivityType>& activities); 182 183 TORCH_API void toggleCollectionDynamic( 184 const bool enable, 185 const std::set<torch::profiler::impl::ActivityType>& activities); 186 187 /** 188 * When a C++ thread really has no control over how the profiler was enabled, 189 * for example, by some unreachable Python code, it can call these functions 190 * to test/join/unjoin itself into the collection set of a profiler, if any. 191 * Without calling these functions, the symptom may be "not seeing GPU events 192 * from some child C++ threads". This is an example on how to use them, 193 * 194 * using namespace torch::autograd::profiler; 195 * bool enabled = isProfilerEnabledInMainThread(); 196 * if (enabled != saved_enabled_state) { 197 * if (enabled) { 198 * enableProfilerInChildThread(); 199 * } else { 200 * disableProfilerInChildThread(); 201 * } 202 * saved_enabled_state = enabled; 203 * } 204 */ 205 TORCH_API bool isProfilerEnabledInMainThread(); 206 TORCH_API void enableProfilerInChildThread(); 207 TORCH_API void disableProfilerInChildThread(); 208 209 } // namespace autograd::profiler 210 211 namespace profiler::impl { 212 213 // Experimental. 214 TORCH_API void _reportVulkanEventToProfiler(vulkan_id_t id); 215 216 } // namespace profiler::impl 217 218 } // namespace torch 219