xref: /aosp_15_r20/external/executorch/runtime/platform/profiler.h (revision 523fa7a60841cd1ecfb9cc4201f1ca8b03ed023a)
1 /*
2  * Copyright (c) Meta Platforms, Inc. and affiliates.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD-style license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 #pragma once
10 
11 #include <stdint.h>
12 #include <stdlib.h>
13 #include <string.h>
14 
15 namespace executorch {
16 namespace runtime {
17 
// Profiler version number (a numeric constant, not a string). It is used to
// check for compatibility with the post-processing tool.
#define ET_PROF_VER 0x00000001
21 
// Maximum number of perf events that can be profiled; defaults to 1024.
// A build target may pre-define MAX_PROFILE_EVENTS at compile time to change
// the size of the profiling buffer.
#if !defined(MAX_PROFILE_EVENTS)
#define MAX_PROFILE_EVENTS 1024
#endif
// Maximum number of memory allocation events that can be profiled; defaults
// to 1024. A build target may pre-define MAX_MEM_PROFILE_EVENTS, which
// increases or decreases the size of the profiling buffer accordingly.
#if !defined(MAX_MEM_PROFILE_EVENTS)
#define MAX_MEM_PROFILE_EVENTS 1024
#endif
// Maximum number of allocators that can be profiled; defaults to 32. Users
// with more allocators than that can pre-define MEM_PROFILE_MAX_ALLOCATORS at
// compile time. The profiling buffer grows or shrinks with this value.
#if !defined(MEM_PROFILE_MAX_ALLOCATORS)
#define MEM_PROFILE_MAX_ALLOCATORS 32
#endif
// Maximum number of profiling blocks; defaults to 2. Users who profile
// something that is iterated on multiple times have to raise this value to
// cover their use case. In post-processing the stats for all of these
// iterations are consolidated.
#if !defined(MAX_PROFILE_BLOCKS)
#define MAX_PROFILE_BLOCKS 2
#endif
48 
// Size, in chars, of the fixed-length name arrays embedded in the profiling
// records declared in this header.
#define PROF_NAME_MAX_LEN 32

// One perf-event record. The layout is fixed: 8-byte alignment, a
// PROF_NAME_MAX_LEN-byte name slot, two 32-bit indices and two 64-bit
// timestamps.
struct alignas(8) prof_event_t {
  // The name slot can hold either a pointer to a string or the characters
  // themselves.
  // NOTE(review): which member is active at a given time is decided by the
  // profiler implementation, which is not visible in this header.
  union {
    const char* name_str;
    char name[PROF_NAME_MAX_LEN];
  };
  // -1 is the null value: the event happened outside of chain execution.
  int32_t chain_idx;
  uint32_t instruction_idx;
  // NOTE(review): the time unit is not stated in this header.
  uint64_t start_time;
  uint64_t end_time;
};
63 
// One memory-allocation record: an allocator id paired with an allocation
// size. 8 bytes, 8-byte aligned.
// NOTE(review): presumably these mirror the (id, size) arguments of
// track_allocation() -- confirm against the profiler implementation.
struct alignas(8) mem_prof_event_t {
  uint32_t allocator_id;
  uint32_t allocation_size;
};
68 
69 typedef struct alignas(8) {
70   char name[PROF_NAME_MAX_LEN];
71   uint64_t allocator_id;
72 } prof_allocator_t;
73 
// Result descriptor passed to dump_profile_stats(): a pointer to profiling
// data together with a byte count and a block count.
// NOTE(review): whether num_bytes covers one block or all blocks is not
// visible in this header -- confirm against the profiler implementation.
struct alignas(8) prof_result_t {
  uint8_t* prof_data;
  uint32_t num_bytes;
  uint32_t num_blocks;
};
79 
// Header placed at the start of the profiling buffer. It holds a 32-char
// name, the profiler version and, for each of the three sections that follow
// it, a max_* field and an entries field.
// NOTE(review): judging by the field names, max_* is the section capacity and
// the matching *_entries field is the number of entries in use; prof_ver
// presumably holds ET_PROF_VER -- confirm against the profiler
// implementation.
struct alignas(8) prof_header_t {
  // Same length as PROF_NAME_MAX_LEN (32).
  char name[32];
  uint32_t prof_ver;
  // Perf events.
  uint32_t max_prof_entries;
  uint32_t prof_entries;
  // Allocators.
  uint32_t max_allocator_entries;
  uint32_t allocator_entries;
  // Memory allocation events.
  uint32_t max_mem_prof_entries;
  uint32_t mem_prof_entries;
};
90 
91 /*
92 This is what the layout of the profiling buffer looks like.
93 ---------------------------------------
94 | Profiling header                    |
95 ---------------------------------------
96 | Profile events (Perf events)        |
97 ---------------------------------------
98 | Memory allocators info              |
99 ---------------------------------------
100 | Profile events (Memory allocations) |
101 ---------------------------------------
102 */
103 
104 // offsets of the various sections in the profiling buffer
105 // Total size required for profiling buffer
106 constexpr uint32_t prof_buf_size = sizeof(prof_header_t) +
107     sizeof(prof_event_t) * MAX_PROFILE_EVENTS +
108     sizeof(mem_prof_event_t) * MAX_MEM_PROFILE_EVENTS +
109     sizeof(prof_allocator_t) * MEM_PROFILE_MAX_ALLOCATORS;
110 
111 constexpr size_t prof_header_offset = 0;
112 constexpr size_t prof_events_offset = sizeof(prof_header_t);
113 constexpr size_t prof_mem_alloc_info_offset =
114     prof_events_offset + sizeof(prof_event_t) * MAX_PROFILE_EVENTS;
115 constexpr size_t prof_mem_alloc_events_offset = prof_mem_alloc_info_offset +
116     sizeof(prof_allocator_t) * MEM_PROFILE_MAX_ALLOCATORS;
117 
// Puts the profiler into its initial state. This assumes that the statically
// allocated buffer declared in the profiler module is the one in use.
void profiler_init();
121 
// Starts profiling the event called `name`. The returned token is the handle
// by which this event is referred to later on.
uint32_t begin_profiling(const char* name);

// Ends the profiling event identified by `token_id`, i.e. a token previously
// returned by begin_profiling().
void end_profiling(uint32_t token_id);
128 
129 // Dump profiler results, return pointer to prof event array and number of
130 // events in it.
131 void dump_profile_stats(prof_result_t* prof_result);
132 
// Target of EXECUTORCH_RESET_PROFILE_RESULTS() when PROFILING_ENABLED is
// defined.
// NOTE(review): presumably discards the profiling data recorded so far --
// confirm against the definition.
void reset_profile_stats();
134 
// Target of EXECUTORCH_TRACK_ALLOCATION(id, size) when PROFILING_ENABLED is
// defined.
// NOTE(review): presumably records an allocation of `size` against the
// allocator `id`. `id` is signed here although track_allocator() returns an
// unsigned value -- confirm the intended id type.
void track_allocation(int32_t id, uint32_t size);

// Target of EXECUTORCH_TRACK_ALLOCATOR(name) when PROFILING_ENABLED is
// defined.
// NOTE(review): presumably registers an allocator called `name` and returns
// its id -- confirm against the definition.
uint32_t track_allocator(const char* name);
138 
// Target of EXECUTORCH_PROFILE_CREATE_BLOCK(name) when PROFILING_ENABLED is
// defined.
// NOTE(review): presumably starts a new profiling block called `name`; the
// number of blocks is bounded by MAX_PROFILE_BLOCKS -- confirm against the
// definition.
void profiling_create_block(const char* name);
140 
// This class enables scope based profiling where needed. Profiling
// will be started when the object is created and will end when the
// object goes out of scope.
class ExecutorchProfiler {
 public:
  // Starts profiling the event called `name`.
  explicit ExecutorchProfiler(const char* name);

  // Ends the profiling started by the constructor.
  ~ExecutorchProfiler();

  // ScopeGuard: non-copyable, non-movable. A copy would carry the same token
  // and end the same profiling event a second time when it is destroyed.
  ExecutorchProfiler(const ExecutorchProfiler&) = delete;
  ExecutorchProfiler& operator=(const ExecutorchProfiler&) = delete;

  ExecutorchProfiler(ExecutorchProfiler&&) = delete;
  ExecutorchProfiler& operator=(ExecutorchProfiler&&) = delete;

 private:
  // NOTE(review): presumably the token returned by begin_profiling() --
  // confirm against the constructor definition.
  uint32_t prof_tok;
};
153 
// Position of the instruction being profiled: a chain index plus an
// instruction index. This is the state exchanged through
// get_profile_tls_state() / set_profile_tls_state().
struct prof_state_t {
  int32_t chain_idx;
  uint32_t instruction_idx;
};
158 
159 const prof_state_t& get_profile_tls_state();
160 
161 void set_profile_tls_state(const prof_state_t& state);
162 
163 class ExecutorchProfilerInstructionScope {
164  public:
165   explicit ExecutorchProfilerInstructionScope(const prof_state_t& state);
166   ~ExecutorchProfilerInstructionScope();
167 
168   // ScopeGuard: non-copyable, non-movable
169   ExecutorchProfilerInstructionScope(
170       const ExecutorchProfilerInstructionScope&) = delete;
171   ExecutorchProfilerInstructionScope& operator=(
172       const ExecutorchProfilerInstructionScope&) = delete;
173 
174   ExecutorchProfilerInstructionScope(ExecutorchProfilerInstructionScope&&) =
175       delete;
176   ExecutorchProfilerInstructionScope& operator=(
177       ExecutorchProfilerInstructionScope&&) = delete;
178 
179  private:
180   prof_state_t old_state_;
181 };
182 
183 } // namespace runtime
184 } // namespace executorch
185 
186 namespace torch {
187 namespace executor {
188 // TODO(T197294990): Remove these deprecated aliases once all users have moved
189 // to the new `::executorch` namespaces.
190 using ::executorch::runtime::begin_profiling;
191 using ::executorch::runtime::dump_profile_stats;
192 using ::executorch::runtime::end_profiling;
193 using ::executorch::runtime::ExecutorchProfiler;
194 using ::executorch::runtime::ExecutorchProfilerInstructionScope;
195 using ::executorch::runtime::get_profile_tls_state;
196 using ::executorch::runtime::mem_prof_event_t;
197 using ::executorch::runtime::prof_allocator_t;
198 using ::executorch::runtime::prof_buf_size;
199 using ::executorch::runtime::prof_event_t;
200 using ::executorch::runtime::prof_events_offset;
201 using ::executorch::runtime::prof_header_offset;
202 using ::executorch::runtime::prof_header_t;
203 using ::executorch::runtime::prof_mem_alloc_events_offset;
204 using ::executorch::runtime::prof_mem_alloc_info_offset;
205 using ::executorch::runtime::prof_result_t;
206 using ::executorch::runtime::prof_state_t;
207 using ::executorch::runtime::profiler_init;
208 using ::executorch::runtime::profiling_create_block;
209 using ::executorch::runtime::reset_profile_stats;
210 using ::executorch::runtime::set_profile_tls_state;
211 using ::executorch::runtime::track_allocation;
212 using ::executorch::runtime::track_allocator;
213 } // namespace executor
214 } // namespace torch
215 
216 #ifdef PROFILING_ENABLED
217 
// Profiling hooks used when PROFILING_ENABLED is defined. Each macro forwards
// to the ::executorch::runtime function or class named in its expansion.
// NOTE(review): every expansion already ends in ';'. A call site that adds
// its own ';' therefore produces an extra empty statement, which is harmless
// in a statement list but breaks an unbraced if/else. The semicolons are kept
// so that existing call sites keep compiling.

// Forwards to profiling_create_block(name).
#define EXECUTORCH_PROFILE_CREATE_BLOCK(name) \
  ::executorch::runtime::profiling_create_block(name);

// Convenience macros to begin and end profiling. These can be inserted
// anywhere as it'll be ensured that for the prod builds these will
// essentially be noops.
#define EXECUTORCH_BEGIN_PROF(name) \
  ::executorch::runtime::begin_profiling(name);

#define EXECUTORCH_END_PROF(token_id) \
  ::executorch::runtime::end_profiling(token_id);

// Declares a local ExecutorchProfiler called `profiler`, so the enclosing
// scope must not declare anything else with that name.
#define EXECUTORCH_SCOPE_PROF(name) \
  ::executorch::runtime::ExecutorchProfiler profiler(name);

// Declares a local ExecutorchProfilerInstructionScope initialized from
// {chain_idx, instruction_idx}. The name of the local contains no double
// underscore, because such identifiers are reserved to the implementation.
#define EXECUTORCH_PROFILE_INSTRUCTION_SCOPE(chain_idx, instruction_idx) \
  ::executorch::runtime::ExecutorchProfilerInstructionScope              \
      et_profiler_instruction_scope_guard({chain_idx, instruction_idx});

// Forwards to dump_profile_stats(prof_result).
#define EXECUTORCH_DUMP_PROFILE_RESULTS(prof_result) \
  ::executorch::runtime::dump_profile_stats(prof_result);

// Forwards to reset_profile_stats().
#define EXECUTORCH_RESET_PROFILE_RESULTS() \
  ::executorch::runtime::reset_profile_stats();

// Forwards to track_allocator(name).
#define EXECUTORCH_TRACK_ALLOCATOR(name) \
  ::executorch::runtime::track_allocator(name);

// Forwards to track_allocation(id, size).
#define EXECUTORCH_TRACK_ALLOCATION(id, size) \
  ::executorch::runtime::track_allocation(id, size);
248 
249 #else
250 
// Stand-ins used when PROFILING_ENABLED is not defined. None of them calls
// into the profiler. Most only evaluate their arguments and discard the
// result, which keeps the arguments "used" without doing any work.

// Evaluates and discards `name`.
#define EXECUTORCH_PROFILE_CREATE_BLOCK(name) do { (void)(name); } while (0)

// Expands to an empty brace pair and never evaluates `name`. As an
// initializer (`uint32_t tok = EXECUTORCH_BEGIN_PROF(x);`) this
// value-initializes the token; as a statement it is an empty block.
#define EXECUTORCH_BEGIN_PROF(name) {}

// Evaluates and discards `token_id`.
#define EXECUTORCH_END_PROF(token_id) do { (void)(token_id); } while (0)

// Evaluates and discards `name`.
#define EXECUTORCH_SCOPE_PROF(name) do { (void)(name); } while (0)

// Evaluates and discards both indices.
#define EXECUTORCH_PROFILE_INSTRUCTION_SCOPE(chain_idx, instruction_idx) \
  do {                                                                   \
    (void)(chain_idx);                                                   \
    (void)(instruction_idx);                                             \
  } while (0)

// Zero-fills the prof_result_t that `prof_result_test` points to. The
// expansion already ends in ';'.
#define EXECUTORCH_DUMP_PROFILE_RESULTS(prof_result_test) \
  memset(prof_result_test, 0, sizeof(::executorch::runtime::prof_result_t));

// Expands to an empty brace pair.
#define EXECUTORCH_RESET_PROFILE_RESULTS() {}

// Evaluates and discards `name`; the value of the expression is -1.
#define EXECUTORCH_TRACK_ALLOCATOR(name) ((void)(name), -1)

// Evaluates and discards `id` and `size`.
#define EXECUTORCH_TRACK_ALLOCATION(id, size) \
  do {                                        \
    (void)(id);                               \
    (void)(size);                             \
  } while (0)
288 
289 #endif
290