/*
 * Copyright (c) Facebook, Inc.
 * Licensed under the Apache License, Version 2.0 (the "License")
 */

#include <string>

namespace ebpf {
namespace pyperf {

extern const std::string PYPERF_BPF_PROGRAM = R"(
#include <linux/sched.h>
#include <uapi/linux/ptrace.h>

#define PYTHON_STACK_FRAMES_PER_PROG 25
#define PYTHON_STACK_PROG_CNT 3
#define STACK_MAX_LEN (PYTHON_STACK_FRAMES_PER_PROG * PYTHON_STACK_PROG_CNT)
#define CLASS_NAME_LEN 32
#define FUNCTION_NAME_LEN 64
#define FILE_NAME_LEN 128
#define TASK_COMM_LEN 16

enum {
  STACK_STATUS_COMPLETE = 0,
  STACK_STATUS_ERROR = 1,
  STACK_STATUS_TRUNCATED = 2,
};

enum {
  GIL_STATE_NO_INFO = 0,
  GIL_STATE_ERROR = 1,
  GIL_STATE_UNINITIALIZED = 2,
  GIL_STATE_NOT_LOCKED = 3,
  GIL_STATE_THIS_THREAD = 4,
  GIL_STATE_GLOBAL_CURRENT_THREAD = 5,
  GIL_STATE_OTHER_THREAD = 6,
  GIL_STATE_NULL = 7,
};

enum {
  THREAD_STATE_UNKNOWN = 0,
  THREAD_STATE_MATCH = 1,
  THREAD_STATE_MISMATCH = 2,
  THREAD_STATE_THIS_THREAD_NULL = 3,
  THREAD_STATE_GLOBAL_CURRENT_THREAD_NULL = 4,
  THREAD_STATE_BOTH_NULL = 5,
};

enum {
  PTHREAD_ID_UNKNOWN = 0,
  PTHREAD_ID_MATCH = 1,
  PTHREAD_ID_MISMATCH = 2,
  PTHREAD_ID_THREAD_STATE_NULL = 3,
  PTHREAD_ID_NULL = 4,
  PTHREAD_ID_ERROR = 5,
};

typedef struct {
  int64_t PyObject_type;
  int64_t PyTypeObject_name;
  int64_t PyThreadState_frame;
  int64_t PyThreadState_thread;
  int64_t PyFrameObject_back;
  int64_t PyFrameObject_code;
  int64_t PyFrameObject_lineno;
  int64_t PyFrameObject_localsplus;
  int64_t PyCodeObject_filename;
  int64_t PyCodeObject_name;
  int64_t PyCodeObject_varnames;
  int64_t PyTupleObject_item;
  int64_t String_data;
  int64_t String_size;
} OffsetConfig;

typedef struct {
  uintptr_t current_state_addr; // virtual address of _PyThreadState_Current
  uintptr_t tls_key_addr; // virtual address of autoTLSkey for pthreads TLS
  uintptr_t gil_locked_addr; // virtual address of gil_locked
  uintptr_t gil_last_holder_addr; // virtual address of gil_last_holder
  OffsetConfig offsets;
} PidData;

typedef struct {
  char classname[CLASS_NAME_LEN];
  char name[FUNCTION_NAME_LEN];
  char file[FILE_NAME_LEN];
  // NOTE: PyFrameObject also has a line number, but it is typically just the
  // first line of the function; PyCode_Addr2Line needs to be called to get
  // the actual line.
} Symbol;

typedef struct {
  uint32_t pid;
  uint32_t tid;
  char comm[TASK_COMM_LEN];
  uint8_t thread_state_match;
  uint8_t gil_state;
  uint8_t pthread_id_match;
  uint8_t stack_status;
  // Instead of storing the symbol name here directly, we add it to a separate
  // hash map of Symbols and only store the ids here.
  int64_t stack_len;
  int32_t stack[STACK_MAX_LEN];
} Event;

#define _STR_CONCAT(str1, str2) str1##str2
#define STR_CONCAT(str1, str2) _STR_CONCAT(str1, str2)
#define FAIL_COMPILATION_IF(condition)            \
  typedef struct {                                \
    char _condition_check[1 - 2 * !!(condition)]; \
  } STR_CONCAT(compile_time_condition_check, __COUNTER__);
// See the comment in get_names
FAIL_COMPILATION_IF(sizeof(Symbol) == sizeof(struct bpf_perf_event_value))

typedef struct {
  OffsetConfig offsets;
  uint64_t cur_cpu;
  int64_t symbol_counter;
  void* frame_ptr;
  int64_t python_stack_prog_call_cnt;
  Event event;
} sample_state_t;

BPF_PERCPU_ARRAY(state_heap, sample_state_t, 1);
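// state_heap is a single-entry per-CPU array used as scratch space: the
// sample_state_t above (which embeds a full Event, including the stack id
// array) is far too large for the 512-byte BPF stack, so each invocation
// borrows this per-CPU slot instead.
// __SYMBOLS_SIZE__ is a placeholder; like NUM_CPUS, it is presumably filled
// in by the PyPerf backend (e.g. via a -D cflag) before this program is
// compiled, and sets the capacity of the symbol deduplication map below.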
BPF_HASH(symbols, Symbol, int32_t, __SYMBOLS_SIZE__);
BPF_HASH(pid_config, pid_t, PidData);
BPF_PROG_ARRAY(progs, 1);

BPF_PERF_OUTPUT(events);

static inline __attribute__((__always_inline__)) void* get_thread_state(
    void* tls_base,
    PidData* pid_data) {
  // Python sets the thread_state using pthread_setspecific with the key
  // stored in the global variable autoTLSkey.
  // We read the value of the key from the global variable and then read
  // the value in thread-local storage. This relies on the pthread
  // implementation and is basically the same as running the following in GDB:
  //   p *(PyThreadState*)((struct pthread*)pthread_self())->
  //       specific_1stblock[autoTLSkey]->data
  int key;
  bpf_probe_read_user(&key, sizeof(key), (void*)pid_data->tls_key_addr);
  // This assumes autoTLSkey < 32, which means that the TLS is stored in
  //   pthread->specific_1stblock[autoTLSkey]
  // 0x310 is offsetof(struct pthread, specific_1stblock),
  // 0x10 is sizeof(pthread_key_data),
  // 0x8 is offsetof(struct pthread_key_data, data).
  // 'struct pthread' is not in the public API, so we have to hardcode
  // the offsets here.
  void* thread_state;
  bpf_probe_read_user(
      &thread_state,
      sizeof(thread_state),
      tls_base + 0x310 + key * 0x10 + 0x08);
  return thread_state;
}

static inline __attribute__((__always_inline__)) int submit_sample(
    struct pt_regs* ctx,
    sample_state_t* state) {
  events.perf_submit(ctx, &state->event, sizeof(Event));
  return 0;
}

// This function is trivial, but the map lookup has to live in a separate
// function because BCC doesn't allow direct map calls (including lookups)
// from inside a macro, which is what the GET_STATE() macro below would
// otherwise need to do.
static inline __attribute__((__always_inline__)) sample_state_t* get_state() {
  int zero = 0;
  return state_heap.lookup(&zero);
}

#define GET_STATE()                     \
  sample_state_t* state = get_state();  \
  if (!state) {                         \
    return 0; /* should never happen */ \
  }

static inline __attribute__((__always_inline__)) int get_thread_state_match(
    void* this_thread_state,
    void* global_thread_state) {
  if (this_thread_state == 0 && global_thread_state == 0) {
    return THREAD_STATE_BOTH_NULL;
  }
  if (this_thread_state == 0) {
    return THREAD_STATE_THIS_THREAD_NULL;
  }
  if (global_thread_state == 0) {
    return THREAD_STATE_GLOBAL_CURRENT_THREAD_NULL;
  }
  if (this_thread_state == global_thread_state) {
    return THREAD_STATE_MATCH;
  } else {
    return THREAD_STATE_MISMATCH;
  }
}

static inline __attribute__((__always_inline__)) int get_gil_state(
    void* this_thread_state,
    void* global_thread_state,
    PidData* pid_data) {
  // Get information about the GIL state
  if (pid_data->gil_locked_addr == 0 || pid_data->gil_last_holder_addr == 0) {
    return GIL_STATE_NO_INFO;
  }

  int gil_locked = 0;
  void* gil_thread_state = 0;
  if (bpf_probe_read_user(
          &gil_locked, sizeof(gil_locked), (void*)pid_data->gil_locked_addr)) {
    return GIL_STATE_ERROR;
  }

  switch (gil_locked) {
    case -1:
      return GIL_STATE_UNINITIALIZED;
    case 0:
      return GIL_STATE_NOT_LOCKED;
    case 1:
      // GIL is held by some thread
      bpf_probe_read_user(
          &gil_thread_state,
          sizeof(void*),
          (void*)pid_data->gil_last_holder_addr);
      if (gil_thread_state == this_thread_state) {
        return GIL_STATE_THIS_THREAD;
      } else if (gil_thread_state == global_thread_state) {
        return GIL_STATE_GLOBAL_CURRENT_THREAD;
      } else if (gil_thread_state == 0) {
        return GIL_STATE_NULL;
      } else {
        return GIL_STATE_OTHER_THREAD;
      }
    default:
      return GIL_STATE_ERROR;
  }
}

static inline __attribute__((__always_inline__)) int
get_pthread_id_match(void* thread_state, void* tls_base, PidData* pid_data) {
  if (thread_state == 0) {
    return PTHREAD_ID_THREAD_STATE_NULL;
  }

  uint64_t pthread_self, pthread_created;

  bpf_probe_read_user(
      &pthread_created,
      sizeof(pthread_created),
      thread_state + pid_data->offsets.PyThreadState_thread);
  if (pthread_created == 0) {
    return PTHREAD_ID_NULL;
  }

  // 0x10 = offsetof(struct pthread, header.self)
  bpf_probe_read_user(&pthread_self, sizeof(pthread_self), tls_base + 0x10);
  if (pthread_self == 0) {
    return PTHREAD_ID_ERROR;
  }

  if (pthread_self == pthread_created) {
    return PTHREAD_ID_MATCH;
  } else {
    return PTHREAD_ID_MISMATCH;
  }
}

int on_event(struct pt_regs* ctx) {
  uint64_t pid_tgid = bpf_get_current_pid_tgid();
  pid_t pid = (pid_t)(pid_tgid >> 32);
  PidData* pid_data = pid_config.lookup(&pid);
  if (!pid_data) {
    return 0;
  }

  GET_STATE();

  state->offsets = pid_data->offsets;
  state->cur_cpu = bpf_get_smp_processor_id();
  state->python_stack_prog_call_cnt = 0;

  Event* event = &state->event;
  event->pid = pid;
  event->tid = (pid_t)pid_tgid;
  bpf_get_current_comm(&event->comm, sizeof(event->comm));

  // Get the pointer to the global PyThreadState, which should belong to the
  // thread that currently holds the GIL
  void* global_current_thread = (void*)0;
  bpf_probe_read_user(
      &global_current_thread,
      sizeof(global_current_thread),
      (void*)pid_data->current_state_addr);

  struct task_struct* task = (struct task_struct*)bpf_get_current_task();
#if __x86_64__
  // thread_struct->fs was renamed to fsbase in
  // https://github.com/torvalds/linux/commit/296f781a4b7801ad9c1c0219f9e87b6c25e196fe
  // so depending on kernel version, we need to account for that
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 7, 0)
  void* tls_base = (void*)task->thread.fs;
#else
  void* tls_base = (void*)task->thread.fsbase;
#endif
#elif __aarch64__
  void* tls_base = (void*)task->thread.tp_value;
#else
#error "Unsupported platform"
#endif

  // Read the PyThreadState of this thread from TLS
  void* thread_state = get_thread_state(tls_base, pid_data);

  // Check whether the TLS PyThreadState matches the global
  // _PyThreadState_Current
  event->thread_state_match =
      get_thread_state_match(thread_state, global_current_thread);

  // Read GIL state
  event->gil_state =
      get_gil_state(thread_state, global_current_thread, pid_data);

  // Check whether the pthread that created the current PyThreadState matches
  // the pthread we are actually running on
  event->pthread_id_match =
      get_pthread_id_match(thread_state, tls_base, pid_data);

  // Pre-initialize the event struct in case any subprogram below fails
  event->stack_status = STACK_STATUS_COMPLETE;
  event->stack_len = 0;

  if (thread_state != 0) {
    // Get pointer to top frame from PyThreadState
    bpf_probe_read_user(
        &state->frame_ptr,
        sizeof(void*),
        thread_state + pid_data->offsets.PyThreadState_frame);
    // jump to reading first set of Python frames
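    // progs.call is a bpf tail call into the program stored at index
    // PYTHON_STACK_PROG_IDX of the 'progs' array (the read_python_stack
    // program below, which the PyPerf backend is presumably expected to
    // register there and to define PYTHON_STACK_PROG_IDX for). A successful
    // tail call never returns, so submit_sample below only runs if the tail
    // call fails, e.g. because the prog array slot is empty.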
    progs.call(ctx, PYTHON_STACK_PROG_IDX);
    // we won't ever get here
  }

  return submit_sample(ctx, state);
}

static inline __attribute__((__always_inline__)) void get_names(
    void* cur_frame,
    void* code_ptr,
    OffsetConfig* offsets,
    Symbol* symbol,
    void* ctx) {
  // Figure out whether we want to parse the class name, basically by checking
  // the name of the first argument:
  //   ((PyTupleObject*)$frame->f_code->co_varnames)->ob_item[0]
  // If it's 'self', we get the type and its name; if it's 'cls', we just get
  // the name. This is not perfect, but there is no better way to figure this
  // out from the code object.
  void* args_ptr;
  bpf_probe_read_user(
      &args_ptr, sizeof(void*), code_ptr + offsets->PyCodeObject_varnames);
  bpf_probe_read_user(
      &args_ptr, sizeof(void*), args_ptr + offsets->PyTupleObject_item);
  bpf_probe_read_user_str(
      &symbol->name, sizeof(symbol->name), args_ptr + offsets->String_data);

  // compare strings as ints to save instructions
  char self_str[4] = {'s', 'e', 'l', 'f'};
  char cls_str[4] = {'c', 'l', 's', '\0'};
  bool first_self = *(int32_t*)symbol->name == *(int32_t*)self_str;
  bool first_cls = *(int32_t*)symbol->name == *(int32_t*)cls_str;

  // We re-use the same Symbol instance across loop iterations, which means
  // we will have left-over data in the struct. Although this won't affect
  // correctness of the result, because the strings we read are '\0'-terminated,
  // it would hurt the effectiveness of deduplication.
  // The helper bpf_perf_prog_read_value clears the buffer on error, so here we
  // (ab)use this behavior to clear the memory. It requires the size of Symbol
  // to be different from struct bpf_perf_event_value, which we check at
  // compilation time using the FAIL_COMPILATION_IF macro.
  bpf_perf_prog_read_value(ctx, symbol, sizeof(Symbol));

  // Read class name from $frame->f_localsplus[0]->ob_type->tp_name.
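  // In user-space CPython C-API terms, the block below is roughly
  // (illustrative only, not code that runs here):
  //   PyObject* first_arg = frame->f_localsplus[0];
  //   const char* tp_name = first_self
  //       ? first_arg->ob_type->tp_name          // 'self': go via the instance's type
  //       : ((PyTypeObject*)first_arg)->tp_name;  // 'cls': already a type object
  //   strncpy(symbol->classname, tp_name, CLASS_NAME_LEN);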
  if (first_self || first_cls) {
    void* ptr;
    bpf_probe_read_user(
        &ptr, sizeof(void*), cur_frame + offsets->PyFrameObject_localsplus);
    if (first_self) {
      // we are working with an instance, so first we need to get its type
      bpf_probe_read_user(&ptr, sizeof(void*), ptr + offsets->PyObject_type);
    }
    bpf_probe_read_user(&ptr, sizeof(void*), ptr + offsets->PyTypeObject_name);
    bpf_probe_read_user_str(&symbol->classname, sizeof(symbol->classname), ptr);
  }

  void* pystr_ptr;
  // read PyCodeObject's filename into symbol
  bpf_probe_read_user(
      &pystr_ptr, sizeof(void*), code_ptr + offsets->PyCodeObject_filename);
  bpf_probe_read_user_str(
      &symbol->file, sizeof(symbol->file), pystr_ptr + offsets->String_data);
  // read PyCodeObject's name into symbol
  bpf_probe_read_user(
      &pystr_ptr, sizeof(void*), code_ptr + offsets->PyCodeObject_name);
  bpf_probe_read_user_str(
      &symbol->name, sizeof(symbol->name), pystr_ptr + offsets->String_data);
}

// get_frame_data reads the current PyFrameObject's filename/name and updates
// *frame_ptr in place with the pointer to the next (calling) PyFrameObject.
static inline __attribute__((__always_inline__)) bool get_frame_data(
    void** frame_ptr,
    OffsetConfig* offsets,
    Symbol* symbol,
    // ctx is only used to call the helper that clears symbol; see the comment
    // in get_names
    void* ctx) {
  void* cur_frame = *frame_ptr;
  if (!cur_frame) {
    return false;
  }
  void* code_ptr;
  // read the PyCodeObject first; if that fails, there is no point reading the
  // next frame
  bpf_probe_read_user(
      &code_ptr, sizeof(void*), cur_frame + offsets->PyFrameObject_code);
  if (!code_ptr) {
    return false;
  }

  get_names(cur_frame, code_ptr, offsets, symbol, ctx);

  // read the next PyFrameObject pointer, updating *frame_ptr in place
  bpf_probe_read_user(
      frame_ptr, sizeof(void*), cur_frame + offsets->PyFrameObject_back);

  return true;
}

// To avoid duplicate ids, every CPU needs to use different ids when inserting
// into the hash map. NUM_CPUS is defined on the PyPerf backend side and passed
// in through a cflag.
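// The ids handed out here have the form
//   symbol_counter * NUM_CPUS + cur_cpu
// so each CPU draws from a disjoint arithmetic progression. For example, with
// NUM_CPUS = 4 (an illustrative value only), CPU 0 produces 0, 4, 8, ... while
// CPU 1 produces 1, 5, 9, ..., which keeps concurrently inserted symbols from
// colliding without any locking.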
static inline __attribute__((__always_inline__)) int64_t get_symbol_id(
    sample_state_t* state,
    Symbol* sym) {
  int32_t* symbol_id_ptr = symbols.lookup(sym);
  if (symbol_id_ptr) {
    return *symbol_id_ptr;
  }
  // the symbol is new, bump the counter
  int32_t symbol_id = state->symbol_counter * NUM_CPUS + state->cur_cpu;
  state->symbol_counter++;
  symbols.update(sym, &symbol_id);
  return symbol_id;
}

int read_python_stack(struct pt_regs* ctx) {
  GET_STATE();

  state->python_stack_prog_call_cnt++;
  Event* sample = &state->event;

  Symbol sym = {};
  bool last_res = false;
#pragma unroll
  for (int i = 0; i < PYTHON_STACK_FRAMES_PER_PROG; i++) {
    last_res = get_frame_data(&state->frame_ptr, &state->offsets, &sym, ctx);
    if (last_res) {
      uint32_t symbol_id = get_symbol_id(state, &sym);
      int64_t cur_len = sample->stack_len;
      if (cur_len >= 0 && cur_len < STACK_MAX_LEN) {
        sample->stack[cur_len] = symbol_id;
        sample->stack_len++;
      }
    }
  }

  if (!state->frame_ptr) {
    sample->stack_status = STACK_STATUS_COMPLETE;
  } else {
    if (!last_res) {
      sample->stack_status = STACK_STATUS_ERROR;
    } else {
      sample->stack_status = STACK_STATUS_TRUNCATED;
    }
  }

  if (sample->stack_status == STACK_STATUS_TRUNCATED &&
      state->python_stack_prog_call_cnt < PYTHON_STACK_PROG_CNT) {
    // read next batch of frames
    progs.call(ctx, PYTHON_STACK_PROG_IDX);
  }

  return submit_sample(ctx, state);
}
)";

} // namespace pyperf
} // namespace ebpf
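// Usage sketch (illustrative; the actual PyPerf backend is not part of this
// file): the string above is handed to BCC for runtime compilation, with the
// placeholders filled in via cflags, along the lines of:
//
//   ebpf::BPF bpf;
//   bpf.init(ebpf::pyperf::PYPERF_BPF_PROGRAM,
//            {"-D__SYMBOLS_SIZE__=16384", "-DNUM_CPUS=8",
//             "-DPYTHON_STACK_PROG_IDX=0"});
//
// after which "on_event" is attached to a sampling perf event and
// "read_python_stack" is loaded into slot PYTHON_STACK_PROG_IDX of the
// "progs" prog array. The flag values shown are examples, not the backend's
// actual configuration.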