#!/usr/bin/env python
# @lint-avoid-python-3-compatibility-imports
#
# profile    Profile CPU usage by sampling stack traces at a timed interval.
#            For Linux, uses BCC, BPF, perf_events. Embedded C.
#
# This is an efficient profiler, as stack traces are frequency counted in
# kernel context, rather than passing every stack to user space for frequency
# counting there. Only the unique stacks and counts are passed to user space
# at the end of the profile, greatly reducing the kernel<->user transfer.
#
# By default CPU idle stacks are excluded by simply excluding PID 0.
#
# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is
# a version of this tool that may work on Linux 4.6 - 4.8.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# THANKS: Alexei Starovoitov, who added proper BPF profiling support to Linux;
# Sasha Goldshtein, Andrew Birchall, and Evgeny Vereshchagin, who wrote much
# of the code here, borrowed from tracepoint.py and offcputime.py; and
# Teng Qin, who added perf support in bcc.
#
# 15-Jul-2016   Brendan Gregg   Created this.
# 20-Oct-2016      "      "     Switched to use the new 4.9 support.
# 26-Jan-2019      "      "     Changed to exclude CPU idle by default.
# 11-Apr-2023   Rocky Xing      Added option to increase hash storage size.
29 30from __future__ import print_function 31from bcc import BPF, PerfType, PerfSWConfig 32from bcc.containers import filter_by_containers 33from sys import stderr 34from time import sleep 35import argparse 36import signal 37import os 38import errno 39 40# 41# Process Arguments 42# 43 44# arg validation 45def positive_int(val): 46 try: 47 ival = int(val) 48 except ValueError: 49 raise argparse.ArgumentTypeError("must be an integer") 50 51 if ival < 0: 52 raise argparse.ArgumentTypeError("must be positive") 53 return ival 54 55def positive_int_list(val): 56 vlist = val.split(",") 57 if len(vlist) <= 0: 58 raise argparse.ArgumentTypeError("must be an integer list") 59 60 return [positive_int(v) for v in vlist] 61 62def positive_nonzero_int(val): 63 ival = positive_int(val) 64 if ival == 0: 65 raise argparse.ArgumentTypeError("must be nonzero") 66 return ival 67 68def stack_id_err(stack_id): 69 # -EFAULT in get_stackid normally means the stack-trace is not available, 70 # Such as getting kernel stack trace in userspace code 71 return (stack_id < 0) and (stack_id != -errno.EFAULT) 72 73# arguments 74examples = """examples: 75 ./profile # profile stack traces at 49 Hertz until Ctrl-C 76 ./profile -F 99 # profile stack traces at 99 Hertz 77 ./profile -c 1000000 # profile stack traces every 1 in a million events 78 ./profile 5 # profile at 49 Hertz for 5 seconds only 79 ./profile -f 5 # output in folded format for flame graphs 80 ./profile -p 185 # only profile process with PID 185 81 ./profile -L 185 # only profile thread with TID 185 82 ./profile -U # only show user space stacks (no kernel) 83 ./profile -K # only show kernel space stacks (no user) 84 ./profile --cgroupmap mappath # only trace cgroups in this BPF map 85 ./profile --mntnsmap mappath # only trace mount namespaces in the map 86""" 87parser = argparse.ArgumentParser( 88 description="Profile CPU stack traces at a timed interval", 89 formatter_class=argparse.RawDescriptionHelpFormatter, 90 epilog=examples) 
# -p/-L, -U/-K, and -F/-c are each mutually exclusive pairs
thread_group = parser.add_mutually_exclusive_group()
thread_group.add_argument("-p", "--pid", type=positive_int_list,
    help="profile process with one or more comma separated PIDs only")
thread_group.add_argument("-L", "--tid", type=positive_int_list,
    help="profile thread with one or more comma separated TIDs only")
# TODO: add options for user/kernel threads only
stack_group = parser.add_mutually_exclusive_group()
stack_group.add_argument("-U", "--user-stacks-only", action="store_true",
    help="show stacks from user space only (no kernel space stacks)")
stack_group.add_argument("-K", "--kernel-stacks-only", action="store_true",
    help="show stacks from kernel space only (no user space stacks)")
sample_group = parser.add_mutually_exclusive_group()
sample_group.add_argument("-F", "--frequency", type=positive_int,
    help="sample frequency, Hertz")
sample_group.add_argument("-c", "--count", type=positive_int,
    help="sample period, number of events")
parser.add_argument("-d", "--delimited", action="store_true",
    help="insert delimiter between kernel/user stacks")
parser.add_argument("-a", "--annotations", action="store_true",
    help="add _[k] annotations to kernel frames")
parser.add_argument("-I", "--include-idle", action="store_true",
    help="include CPU idle stacks")
parser.add_argument("-f", "--folded", action="store_true",
    help="output folded format, one line per stack (for flame graphs)")
# BUGFIX: help string previously read "stored and (default ...)" -- a
# copy-paste remnant of the --stack-storage-size wording below.
parser.add_argument("--hash-storage-size", default=40960,
    type=positive_nonzero_int,
    help="the number of hash keys that can be stored (default %(default)s)")
parser.add_argument("--stack-storage-size", default=16384,
    type=positive_nonzero_int,
    help="the number of unique stack traces that can be stored and "
        "displayed (default %(default)s)")
parser.add_argument("duration", nargs="?", default=99999999,
    type=positive_nonzero_int,
    help="duration of trace, in seconds")
parser.add_argument("-C", "--cpu", type=int, default=-1,
    help="cpu number to run profile on")
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
parser.add_argument("--cgroupmap",
    help="trace cgroups in this BPF map only")
parser.add_argument("--mntnsmap",
    help="trace mount namespaces in this BPF map only")

# option logic
args = parser.parse_args()
duration = int(args.duration)
debug = 0
# a delimiter only makes sense when both user and kernel stacks are shown
need_delimiter = args.delimited and not (args.kernel_stacks_only or
    args.user_stacks_only)
# TODO: add stack depth, and interval

#
# Setup BPF
#

# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <uapi/linux/bpf_perf_event.h>
#include <linux/sched.h>

struct key_t {
    u32 pid;
    u64 kernel_ip;
    int user_stack_id;
    int kernel_stack_id;
    char name[TASK_COMM_LEN];
};
BPF_HASH(counts, struct key_t, u64, HASH_STORAGE_SIZE);
BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);

// This code gets a bit complex. Probably not suitable for casual hacking.

int do_perf_event(struct bpf_perf_event_data *ctx) {
    u32 tgid = 0;
    u32 pid = 0;

    struct bpf_pidns_info ns = {};
    if (USE_PIDNS && !bpf_get_ns_current_pid_tgid(PIDNS_DEV, PIDNS_INO, &ns, sizeof(struct bpf_pidns_info))) {
        tgid = ns.tgid;
        pid = ns.pid;
    } else {
        u64 id = bpf_get_current_pid_tgid();
        tgid = id >> 32;
        pid = id;
    }

    if (IDLE_FILTER)
        return 0;

    if (!(THREAD_FILTER))
        return 0;

    if (container_should_be_filtered()) {
        return 0;
    }

    // create map key
    struct key_t key = {.pid = tgid};
    bpf_get_current_comm(&key.name, sizeof(key.name));

    // get stacks
    key.user_stack_id = USER_STACK_GET;
    key.kernel_stack_id = KERNEL_STACK_GET;

    if (key.kernel_stack_id >= 0) {
        // populate extras to fix the kernel stack
        u64 ip = PT_REGS_IP(&ctx->regs);
        u64 page_offset;

        // if ip isn't sane, leave key ips as zero for later checking
#if defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE)
        // x64, 4.16, ..., 4.11, etc., but some earlier kernel didn't have it
        page_offset = __PAGE_OFFSET_BASE;
#elif defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE_L4)
        // x64, 4.17, and later
#if defined(CONFIG_DYNAMIC_MEMORY_LAYOUT) && defined(CONFIG_X86_5LEVEL)
        page_offset = __PAGE_OFFSET_BASE_L5;
#else
        page_offset = __PAGE_OFFSET_BASE_L4;
#endif
#else
        // earlier x86_64 kernels, e.g., 4.6, comes here
        // arm64, s390, powerpc, x86_32
        page_offset = PAGE_OFFSET;
#endif

        if (ip > page_offset) {
            key.kernel_ip = ip;
        }
    }

    counts.increment(key);
    return 0;
}
"""

# pid-namespace translation: resolve PIDs relative to our own pid namespace
# so the tool behaves correctly inside containers; fall back to the root
# namespace IDs if /proc/self/ns/pid cannot be stat'd. This is best-effort:
# any failure simply disables the translation.
try:
    devinfo = os.stat("/proc/self/ns/pid")
    bpf_text = bpf_text.replace('USE_PIDNS', "1")
    bpf_text = bpf_text.replace('PIDNS_DEV', str(devinfo.st_dev))
    bpf_text = bpf_text.replace('PIDNS_INO', str(devinfo.st_ino))
except Exception:
    # BUGFIX: was a bare "except:", which would also swallow
    # KeyboardInterrupt/SystemExit raised during startup
    bpf_text = bpf_text.replace('USE_PIDNS', "0")
    bpf_text = bpf_text.replace('PIDNS_DEV', "0")
    bpf_text = bpf_text.replace('PIDNS_INO', "0")

# set idle filter: by default, drop samples of the idle task (pid 0)
idle_filter = "pid == 0"
if args.include_idle:
    idle_filter = "0"
bpf_text = bpf_text.replace('IDLE_FILTER', idle_filter)

# set process/thread filter (substituted into THREAD_FILTER in the C code;
# note BPF-side "tgid" is the user-visible PID, "pid" the thread/TID)
thread_context = ""
thread_filter = ""
if args.pid is not None:
    thread_context = "PID %s" % args.pid
    thread_filter = " || ".join("tgid == " + str(pid) for pid in args.pid)
elif args.tid is not None:
    thread_context = "TID %s" % args.tid
    thread_filter = " || ".join("pid == " + str(tid) for tid in args.tid)
else:
    thread_context = "all threads"
    thread_filter = '1'
bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter)

# set hash and stack storage sizes
bpf_text = bpf_text.replace('HASH_STORAGE_SIZE', str(args.hash_storage_size))
bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size))

# handle stack args: "-1" disables collection of that stack type, and is
# also recognized by the output code below (stack_id < 0)
kernel_stack_get = "stack_traces.get_stackid(&ctx->regs, 0)"
user_stack_get = "stack_traces.get_stackid(&ctx->regs, BPF_F_USER_STACK)"
stack_context = ""
if args.user_stacks_only:
    stack_context = "user"
    kernel_stack_get = "-1"
elif args.kernel_stacks_only:
    stack_context = "kernel"
    user_stack_get = "-1"
else:
    stack_context = "user + kernel"
bpf_text = bpf_text.replace('USER_STACK_GET', user_stack_get)
bpf_text = bpf_text.replace('KERNEL_STACK_GET', kernel_stack_get)
bpf_text = filter_by_containers(args) + bpf_text

# sampling setup: frequency (Hz) and period (events) are mutually exclusive
sample_freq = 0
sample_period = 0
if args.frequency:
    sample_freq = args.frequency
elif args.count:
    sample_period = args.count
else:
    # If user didn't specify anything, use default 49Hz sampling
    sample_freq = 49
sample_context = "%s%d %s" % (("", sample_freq, "Hertz") if sample_freq
    else ("every ", sample_period, "events"))

# header
if not args.folded:
    print("Sampling at %s of %s by %s stack" %
        (sample_context, thread_context, stack_context), end="")
    if args.cpu >= 0:
        print(" on CPU#{}".format(args.cpu), end="")
    if duration < 99999999:
        print(" for %d secs." % duration)
    else:
        print("... Hit Ctrl-C to end.")

if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()

# initialize BPF & perf_events
b = BPF(text=bpf_text)
b.attach_perf_event(ev_type=PerfType.SOFTWARE,
    ev_config=PerfSWConfig.CPU_CLOCK, fn_name="do_perf_event",
    sample_period=sample_period, sample_freq=sample_freq, cpu=args.cpu)

# signal handler
def signal_ignore(sig, frame):
    # BUGFIX: first parameter renamed from "signal", which shadowed the
    # signal module inside this function
    print()

#
# Output Report
#

# collect samples
try:
    sleep(duration)
except KeyboardInterrupt:
    # as cleanup can take some time, trap Ctrl-C:
    signal.signal(signal.SIGINT, signal_ignore)

if not args.folded:
    print()

def aksym(addr):
    # kernel symbol for addr, with optional "_[k]" annotation so flame graph
    # tooling can color kernel frames differently (-a)
    if args.annotations:
        return b.ksym(addr) + "_[k]".encode()
    else:
        return b.ksym(addr)

# output stacks
missing_stacks = 0
has_collision = False
counts = b.get_table("counts")
# if every hash slot is in use, further samples were likely dropped
htab_full = args.hash_storage_size == len(counts)
stack_traces = b.get_table("stack_traces")
# sort by sample count so the hottest stacks print last
for k, v in sorted(counts.items(), key=lambda kv: kv[1].value):
    # handle get_stackid errors
    if not args.user_stacks_only and stack_id_err(k.kernel_stack_id):
        missing_stacks += 1
        # hash collision (-EEXIST) suggests that the map size may be too small
        has_collision = has_collision or k.kernel_stack_id == -errno.EEXIST
    if not args.kernel_stacks_only and stack_id_err(k.user_stack_id):
        missing_stacks += 1
        has_collision = has_collision or k.user_stack_id == -errno.EEXIST

    user_stack = [] if k.user_stack_id < 0 else \
        stack_traces.walk(k.user_stack_id)
    kernel_tmp = [] if k.kernel_stack_id < 0 else \
        stack_traces.walk(k.kernel_stack_id)

    # fix kernel stack
    kernel_stack = []
    if k.kernel_stack_id >= 0:
        for addr in kernel_tmp:
            kernel_stack.append(addr)
        # prepend the sampled instruction pointer captured in the BPF
        # program (zero if it didn't look like a kernel address)
        if k.kernel_ip:
            kernel_stack.insert(0, k.kernel_ip)

    if args.folded:
        # print folded stack output
        user_stack = list(user_stack)
        kernel_stack = list(kernel_stack)
        line = [k.name.decode('utf-8', 'replace')]
        # if we failed to get the stack id, such as due to no space (-ENOMEM) or
        # hash collision (-EEXIST), we still print a placeholder for consistency
        if not args.kernel_stacks_only:
            if stack_id_err(k.user_stack_id):
                line.append("[Missed User Stack]")
            else:
                line.extend([b.sym(addr, k.pid).decode('utf-8', 'replace') for addr in reversed(user_stack)])
        if not args.user_stacks_only:
            line.extend(["-"] if (need_delimiter and k.kernel_stack_id >= 0 and k.user_stack_id >= 0) else [])
            if stack_id_err(k.kernel_stack_id):
                line.append("[Missed Kernel Stack]")
            else:
                line.extend([aksym(addr).decode('utf-8', 'replace') for addr in reversed(kernel_stack)])
        print("%s %d" % (";".join(line), v.value))
    else:
        # print default multi-line stack output
        if not args.user_stacks_only:
            if stack_id_err(k.kernel_stack_id):
                print("    [Missed Kernel Stack]")
            else:
                for addr in kernel_stack:
                    print("    %s" % aksym(addr).decode('utf-8', 'replace'))
        if not args.kernel_stacks_only:
            if need_delimiter and k.user_stack_id >= 0 and k.kernel_stack_id >= 0:
                print("    --")
            if stack_id_err(k.user_stack_id):
                print("    [Missed User Stack]")
            else:
                for addr in user_stack:
                    print("    %s" % b.sym(addr, k.pid).decode('utf-8', 'replace'))
        print("    %-16s %s (%d)" % ("-", k.name.decode('utf-8', 'replace'), k.pid))
        print("        %d\n" % v.value)

# check missing
if missing_stacks > 0:
    enomem_str = "" if not has_collision else \
        " Consider increasing --stack-storage-size."
    print("WARNING: %d stack traces could not be displayed.%s" %
        (missing_stacks, enomem_str),
        file=stderr)

# check whether hash table is full
if htab_full:
    print("WARNING: hash table full. Consider increasing --hash-storage-size.",
        file=stderr)