1 /* 2 * Copyright (c) Meta Platforms, Inc. and affiliates. 3 * All rights reserved. 4 * 5 * This source code is licensed under the BSD-style license found in the 6 * LICENSE file in the root directory of this source tree. 7 */ 8 9 #pragma once 10 11 #include <stdint.h> 12 #include <stdlib.h> 13 #include <string.h> 14 15 namespace executorch { 16 namespace runtime { 17 18 // Version string used to check for compatibility with post-processing 19 // tool 20 #define ET_PROF_VER 0x00000001 21 22 // By default we support profiling upto 1024 perf events. Build 23 // targets can override this to increase the profiling buffer size 24 // during compilation. 25 #ifndef MAX_PROFILE_EVENTS 26 #define MAX_PROFILE_EVENTS 1024 27 #endif 28 // By default we support profiling upto 1024 memory allocation events. 29 // Build targets can choose to override this, which will consequently have 30 // the effect of increasing/decreasing the profiling buffer size. 31 #ifndef MAX_MEM_PROFILE_EVENTS 32 #define MAX_MEM_PROFILE_EVENTS 1024 33 #endif 34 // By default we support profiling only upto 16 allocators. If users 35 // have more allocators than these then they can override this during 36 // compilation time. There will be an increase/decrease in the profiling 37 // buffer size based on the way this value is changed. 38 #ifndef MEM_PROFILE_MAX_ALLOCATORS 39 #define MEM_PROFILE_MAX_ALLOCATORS 32 40 #endif 41 // By default we support only one profiling block. If users want to profile 42 // something that will be iterated on multiple times then they will have to 43 // increment this to support their use case. In post-processing the stats for 44 // all these iterations will be consolidated. 45 #ifndef MAX_PROFILE_BLOCKS 46 #define MAX_PROFILE_BLOCKS 2 47 #endif 48 49 #define PROF_NAME_MAX_LEN 32 50 51 typedef struct alignas(8) { 52 union { 53 const char* name_str; 54 char name[PROF_NAME_MAX_LEN]; 55 }; 56 // chain_idx == -1 is a null value, when profile event happens out of chain 57 // execution 58 int32_t chain_idx; 59 uint32_t instruction_idx; 60 uint64_t start_time; 61 uint64_t end_time; 62 } prof_event_t; 63 64 typedef struct alignas(8) { 65 uint32_t allocator_id; 66 uint32_t allocation_size; 67 } mem_prof_event_t; 68 69 typedef struct alignas(8) { 70 char name[PROF_NAME_MAX_LEN]; 71 uint64_t allocator_id; 72 } prof_allocator_t; 73 74 typedef struct alignas(8) { 75 uint8_t* prof_data; 76 uint32_t num_bytes; 77 uint32_t num_blocks; 78 } prof_result_t; 79 80 typedef struct alignas(8) { 81 char name[32]; 82 uint32_t prof_ver; 83 uint32_t max_prof_entries; 84 uint32_t prof_entries; 85 uint32_t max_allocator_entries; 86 uint32_t allocator_entries; 87 uint32_t max_mem_prof_entries; 88 uint32_t mem_prof_entries; 89 } prof_header_t; 90 91 /* 92 This is what the layout of the profiling buffer looks like. 93 --------------------------------------- 94 | Profiling header | 95 --------------------------------------- 96 | Profile events (Perf events) | 97 --------------------------------------- 98 | Memory allocators info | 99 --------------------------------------- 100 | Profile events (Memory allocations) | 101 --------------------------------------- 102 */ 103 104 // offsets of the various sections in the profiling buffer 105 // Total size required for profiling buffer 106 constexpr uint32_t prof_buf_size = sizeof(prof_header_t) + 107 sizeof(prof_event_t) * MAX_PROFILE_EVENTS + 108 sizeof(mem_prof_event_t) * MAX_MEM_PROFILE_EVENTS + 109 sizeof(prof_allocator_t) * MEM_PROFILE_MAX_ALLOCATORS; 110 111 constexpr size_t prof_header_offset = 0; 112 constexpr size_t prof_events_offset = sizeof(prof_header_t); 113 constexpr size_t prof_mem_alloc_info_offset = 114 prof_events_offset + sizeof(prof_event_t) * MAX_PROFILE_EVENTS; 115 constexpr size_t prof_mem_alloc_events_offset = prof_mem_alloc_info_offset + 116 sizeof(prof_allocator_t) * MEM_PROFILE_MAX_ALLOCATORS; 117 118 // Set the initial state for the profiler assuming we're using the 119 // statically allocated buffer declared in the profiler module. 120 void profiler_init(void); 121 122 // This starts the profiling of this event and returns a token 123 // by which this event can be referred to in the future. 124 uint32_t begin_profiling(const char* name); 125 126 // End profiling event represented by token_id 127 void end_profiling(uint32_t token_id); 128 129 // Dump profiler results, return pointer to prof event array and number of 130 // events in it. 131 void dump_profile_stats(prof_result_t* prof_result); 132 133 void reset_profile_stats(); 134 135 void track_allocation(int32_t id, uint32_t size); 136 137 uint32_t track_allocator(const char* name); 138 139 void profiling_create_block(const char* name); 140 141 // This class enables scope based profiling where needed. Profiling 142 // will be started when the object is created and will end when the 143 // object goes out of scope. 144 class ExecutorchProfiler { 145 public: 146 explicit ExecutorchProfiler(const char* name); 147 148 ~ExecutorchProfiler(); 149 150 private: 151 uint32_t prof_tok; 152 }; 153 154 typedef struct { 155 int32_t chain_idx; 156 uint32_t instruction_idx; 157 } prof_state_t; 158 159 const prof_state_t& get_profile_tls_state(); 160 161 void set_profile_tls_state(const prof_state_t& state); 162 163 class ExecutorchProfilerInstructionScope { 164 public: 165 explicit ExecutorchProfilerInstructionScope(const prof_state_t& state); 166 ~ExecutorchProfilerInstructionScope(); 167 168 // ScopeGuard: non-copyable, non-movable 169 ExecutorchProfilerInstructionScope( 170 const ExecutorchProfilerInstructionScope&) = delete; 171 ExecutorchProfilerInstructionScope& operator=( 172 const ExecutorchProfilerInstructionScope&) = delete; 173 174 ExecutorchProfilerInstructionScope(ExecutorchProfilerInstructionScope&&) = 175 delete; 176 ExecutorchProfilerInstructionScope& operator=( 177 ExecutorchProfilerInstructionScope&&) = delete; 178 179 private: 180 prof_state_t old_state_; 181 }; 182 183 } // namespace runtime 184 } // namespace executorch 185 186 namespace torch { 187 namespace executor { 188 // TODO(T197294990): Remove these deprecated aliases once all users have moved 189 // to the new `::executorch` namespaces. 190 using ::executorch::runtime::begin_profiling; 191 using ::executorch::runtime::dump_profile_stats; 192 using ::executorch::runtime::end_profiling; 193 using ::executorch::runtime::ExecutorchProfiler; 194 using ::executorch::runtime::ExecutorchProfilerInstructionScope; 195 using ::executorch::runtime::get_profile_tls_state; 196 using ::executorch::runtime::mem_prof_event_t; 197 using ::executorch::runtime::prof_allocator_t; 198 using ::executorch::runtime::prof_buf_size; 199 using ::executorch::runtime::prof_event_t; 200 using ::executorch::runtime::prof_events_offset; 201 using ::executorch::runtime::prof_header_offset; 202 using ::executorch::runtime::prof_header_t; 203 using ::executorch::runtime::prof_mem_alloc_events_offset; 204 using ::executorch::runtime::prof_mem_alloc_info_offset; 205 using ::executorch::runtime::prof_result_t; 206 using ::executorch::runtime::prof_state_t; 207 using ::executorch::runtime::profiler_init; 208 using ::executorch::runtime::profiling_create_block; 209 using ::executorch::runtime::reset_profile_stats; 210 using ::executorch::runtime::set_profile_tls_state; 211 using ::executorch::runtime::track_allocation; 212 using ::executorch::runtime::track_allocator; 213 } // namespace executor 214 } // namespace torch 215 216 #ifdef PROFILING_ENABLED 217 218 #define EXECUTORCH_PROFILE_CREATE_BLOCK(name) \ 219 ::executorch::runtime::profiling_create_block(name); 220 221 // Convenience macros to begin and end profiling. These can be inserted 222 // anywhere as it'll be ensured that for the prod builds these will 223 // essentially be noops. 224 #define EXECUTORCH_BEGIN_PROF(name) \ 225 ::executorch::runtime::begin_profiling(name); 226 227 #define EXECUTORCH_END_PROF(token_id) \ 228 ::executorch::runtime::end_profiling(token_id); 229 230 #define EXECUTORCH_SCOPE_PROF(name) \ 231 ::executorch::runtime::ExecutorchProfiler profiler(name); 232 233 #define EXECUTORCH_PROFILE_INSTRUCTION_SCOPE(chain_idx, instruction_idx) \ 234 ::executorch::runtime::ExecutorchProfilerInstructionScope \ 235 __profiler_instruction_scope({chain_idx, instruction_idx}); 236 237 #define EXECUTORCH_DUMP_PROFILE_RESULTS(prof_result) \ 238 ::executorch::runtime::dump_profile_stats(prof_result); 239 240 #define EXECUTORCH_RESET_PROFILE_RESULTS() \ 241 ::executorch::runtime::reset_profile_stats(); 242 243 #define EXECUTORCH_TRACK_ALLOCATOR(name) \ 244 ::executorch::runtime::track_allocator(name); 245 246 #define EXECUTORCH_TRACK_ALLOCATION(id, size) \ 247 ::executorch::runtime::track_allocation(id, size); 248 249 #else 250 251 #define EXECUTORCH_PROFILE_CREATE_BLOCK(name) \ 252 do { \ 253 (void)(name); \ 254 } while (0) 255 256 #define EXECUTORCH_BEGIN_PROF(name) \ 257 {} 258 259 #define EXECUTORCH_END_PROF(token_id) \ 260 do { \ 261 (void)(token_id); \ 262 } while (0) 263 264 #define EXECUTORCH_SCOPE_PROF(name) \ 265 do { \ 266 (void)(name); \ 267 } while (0) 268 269 #define EXECUTORCH_PROFILE_INSTRUCTION_SCOPE(chain_idx, instruction_idx) \ 270 do { \ 271 (void)(chain_idx); \ 272 (void)(instruction_idx); \ 273 } while (0) 274 275 #define EXECUTORCH_DUMP_PROFILE_RESULTS(prof_result_test) \ 276 memset(prof_result_test, 0, sizeof(::executorch::runtime::prof_result_t)); 277 278 #define EXECUTORCH_RESET_PROFILE_RESULTS() \ 279 {} 280 281 #define EXECUTORCH_TRACK_ALLOCATOR(name) ((void)(name), -1) 282 283 #define EXECUTORCH_TRACK_ALLOCATION(id, size) \ 284 do { \ 285 (void)(id); \ 286 (void)(size); \ 287 } while (0) 288 289 #endif 290