1#!/usr/bin/env python 2# 3# offcputime Summarize off-CPU time by stack trace 4# For Linux, uses BCC, eBPF. 5# 6# USAGE: offcputime [-h] [-p PID | -t TID | -u | -k] [-U | -K] [-d] [-f] [-s] 7# [--stack-storage-size STACK_STORAGE_SIZE] 8# [-m MIN_BLOCK_TIME] [-M MAX_BLOCK_TIME] [--state STATE] 9# [duration] 10# 11# Copyright 2016 Netflix, Inc. 12# Licensed under the Apache License, Version 2.0 (the "License") 13# 14# 13-Jan-2016 Brendan Gregg Created this. 15# 27-Mar-2023 Rocky Xing Added option to show symbol offsets. 16# 04-Apr-2023 Rocky Xing Updated default stack storage size. 17 18from __future__ import print_function 19from bcc import BPF 20from sys import stderr 21import argparse 22import errno 23import signal 24 25# arg validation 26def positive_int(val): 27 try: 28 ival = int(val) 29 except ValueError: 30 raise argparse.ArgumentTypeError("must be an integer") 31 32 if ival < 0: 33 raise argparse.ArgumentTypeError("must be positive") 34 return ival 35 36def positive_nonzero_int(val): 37 ival = positive_int(val) 38 if ival == 0: 39 raise argparse.ArgumentTypeError("must be nonzero") 40 return ival 41 42def stack_id_err(stack_id): 43 # -EFAULT in get_stackid normally means the stack-trace is not available, 44 # Such as getting kernel stack trace in userspace code 45 return (stack_id < 0) and (stack_id != -errno.EFAULT) 46 47# arguments 48examples = """examples: 49 ./offcputime # trace off-CPU stack time until Ctrl-C 50 ./offcputime 5 # trace for 5 seconds only 51 ./offcputime -f 5 # 5 seconds, and output in folded format 52 ./offcputime -s 5 # 5 seconds, and show symbol offsets 53 ./offcputime -m 1000 # trace only events that last more than 1000 usec 54 ./offcputime -M 10000 # trace only events that last less than 10000 usec 55 ./offcputime -p 185 # only trace threads for PID 185 56 ./offcputime -t 188 # only trace thread 188 57 ./offcputime -u # only trace user threads (no kernel) 58 ./offcputime -k # only trace kernel threads (no user) 59 ./offcputime -U # 
only show user space stacks (no kernel) 60 ./offcputime -K # only show kernel space stacks (no user) 61""" 62parser = argparse.ArgumentParser( 63 description="Summarize off-CPU time by stack trace", 64 formatter_class=argparse.RawDescriptionHelpFormatter, 65 epilog=examples) 66thread_group = parser.add_mutually_exclusive_group() 67# Note: this script provides --pid and --tid flags but their arguments are 68# referred to internally using kernel nomenclature: TGID and PID. 69thread_group.add_argument("-p", "--pid", metavar="PID", dest="tgid", 70 help="trace this PID only", type=positive_int) 71thread_group.add_argument("-t", "--tid", metavar="TID", dest="pid", 72 help="trace this TID only", type=positive_int) 73thread_group.add_argument("-u", "--user-threads-only", action="store_true", 74 help="user threads only (no kernel threads)") 75thread_group.add_argument("-k", "--kernel-threads-only", action="store_true", 76 help="kernel threads only (no user threads)") 77stack_group = parser.add_mutually_exclusive_group() 78stack_group.add_argument("-U", "--user-stacks-only", action="store_true", 79 help="show stacks from user space only (no kernel space stacks)") 80stack_group.add_argument("-K", "--kernel-stacks-only", action="store_true", 81 help="show stacks from kernel space only (no user space stacks)") 82parser.add_argument("-d", "--delimited", action="store_true", 83 help="insert delimiter between kernel/user stacks") 84parser.add_argument("-f", "--folded", action="store_true", 85 help="output folded format") 86parser.add_argument("-s", "--offset", action="store_true", 87 help="show address offsets") 88parser.add_argument("--stack-storage-size", default=16384, 89 type=positive_nonzero_int, 90 help="the number of unique stack traces that can be stored and " 91 "displayed (default 16384)") 92parser.add_argument("duration", nargs="?", default=99999999, 93 type=positive_nonzero_int, 94 help="duration of trace, in seconds") 95parser.add_argument("-m", "--min-block-time", 
default=1, 96 type=positive_nonzero_int, 97 help="the amount of time in microseconds over which we " + 98 "store traces (default 1)") 99parser.add_argument("-M", "--max-block-time", default=(1 << 64) - 1, 100 type=positive_nonzero_int, 101 help="the amount of time in microseconds under which we " + 102 "store traces (default U64_MAX)") 103parser.add_argument("--state", type=positive_int, 104 help="filter on this thread state bitmask (eg, 2 == TASK_UNINTERRUPTIBLE" + 105 ") see include/linux/sched.h") 106parser.add_argument("--ebpf", action="store_true", 107 help=argparse.SUPPRESS) 108args = parser.parse_args() 109folded = args.folded 110duration = int(args.duration) 111debug = 0 112 113if args.folded and args.offset: 114 print("ERROR: can only use -f or -s. Exiting.") 115 exit() 116 117# signal handler 118def signal_ignore(signal, frame): 119 print() 120 121# define BPF program 122bpf_text = """ 123#include <uapi/linux/ptrace.h> 124#include <linux/sched.h> 125 126#define MINBLOCK_US MINBLOCK_US_VALUEULL 127#define MAXBLOCK_US MAXBLOCK_US_VALUEULL 128 129struct key_t { 130 u32 pid; 131 u32 tgid; 132 int user_stack_id; 133 int kernel_stack_id; 134 char name[TASK_COMM_LEN]; 135}; 136BPF_HASH(counts, struct key_t); 137BPF_HASH(start, u32); 138BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE); 139 140struct warn_event_t { 141 u32 pid; 142 u32 tgid; 143 u32 t_start; 144 u32 t_end; 145}; 146BPF_PERF_OUTPUT(warn_events); 147 148int oncpu(struct pt_regs *ctx, struct task_struct *prev) { 149 u32 pid = prev->pid; 150 u32 tgid = prev->tgid; 151 u64 ts, *tsp; 152 153 // record previous thread sleep time 154 if ((THREAD_FILTER) && (STATE_FILTER)) { 155 ts = bpf_ktime_get_ns(); 156 start.update(&pid, &ts); 157 } 158 159 // get the current thread's start time 160 pid = bpf_get_current_pid_tgid(); 161 tgid = bpf_get_current_pid_tgid() >> 32; 162 tsp = start.lookup(&pid); 163 if (tsp == 0) { 164 return 0; // missed start or filtered 165 } 166 167 // calculate current thread's delta 
time 168 u64 t_start = *tsp; 169 u64 t_end = bpf_ktime_get_ns(); 170 start.delete(&pid); 171 if (t_start > t_end) { 172 struct warn_event_t event = { 173 .pid = pid, 174 .tgid = tgid, 175 .t_start = t_start, 176 .t_end = t_end, 177 }; 178 warn_events.perf_submit(ctx, &event, sizeof(event)); 179 return 0; 180 } 181 u64 delta = t_end - t_start; 182 delta = delta / 1000; 183 if ((delta < MINBLOCK_US) || (delta > MAXBLOCK_US)) { 184 return 0; 185 } 186 187 // create map key 188 struct key_t key = {}; 189 190 key.pid = pid; 191 key.tgid = tgid; 192 key.user_stack_id = USER_STACK_GET; 193 key.kernel_stack_id = KERNEL_STACK_GET; 194 bpf_get_current_comm(&key.name, sizeof(key.name)); 195 196 counts.increment(key, delta); 197 return 0; 198} 199""" 200 201# set thread filter 202thread_context = "" 203if args.tgid is not None: 204 thread_context = "PID %d" % args.tgid 205 thread_filter = 'tgid == %d' % args.tgid 206elif args.pid is not None: 207 thread_context = "TID %d" % args.pid 208 thread_filter = 'pid == %d' % args.pid 209elif args.user_threads_only: 210 thread_context = "user threads" 211 thread_filter = '!(prev->flags & PF_KTHREAD)' 212elif args.kernel_threads_only: 213 thread_context = "kernel threads" 214 thread_filter = 'prev->flags & PF_KTHREAD' 215else: 216 thread_context = "all threads" 217 thread_filter = '1' 218if args.state == 0: 219 state_filter = 'prev->STATE_FIELD == 0' 220elif args.state: 221 # these states are sometimes bitmask checked 222 state_filter = 'prev->STATE_FIELD & %d' % args.state 223else: 224 state_filter = '1' 225bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter) 226bpf_text = bpf_text.replace('STATE_FILTER', state_filter) 227if BPF.kernel_struct_has_field(b'task_struct', b'__state') == 1: 228 bpf_text = bpf_text.replace('STATE_FIELD', '__state') 229else: 230 bpf_text = bpf_text.replace('STATE_FIELD', 'state') 231 232# set stack storage size 233bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size)) 
bpf_text = bpf_text.replace('MINBLOCK_US_VALUE', str(args.min_block_time))
bpf_text = bpf_text.replace('MAXBLOCK_US_VALUE', str(args.max_block_time))

# handle stack args: "-1" disables collection of that stack kind in the
# BPF program (key.*_stack_id becomes a constant negative id)
kernel_stack_get = "stack_traces.get_stackid(ctx, 0)"
user_stack_get = "stack_traces.get_stackid(ctx, BPF_F_USER_STACK)"
stack_context = ""
if args.user_stacks_only:
    stack_context = "user"
    kernel_stack_get = "-1"
elif args.kernel_stacks_only:
    stack_context = "kernel"
    user_stack_get = "-1"
else:
    stack_context = "user + kernel"
bpf_text = bpf_text.replace('USER_STACK_GET', user_stack_get)
bpf_text = bpf_text.replace('KERNEL_STACK_GET', kernel_stack_get)

need_delimiter = args.delimited and not (args.kernel_stacks_only or
                                         args.user_stacks_only)

# check for an edge case; the code below will handle this case correctly
# but ultimately nothing will be displayed
if args.kernel_threads_only and args.user_stacks_only:
    print("ERROR: Displaying user stacks for kernel threads " +
          "doesn't make sense.", file=stderr)
    exit(2)

if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        print("ERROR: Exiting")
        exit(3)

# initialize BPF
b = BPF(text=bpf_text)
# raw string: \. and \d are regex escapes, not Python string escapes
b.attach_kprobe(event_re=r"^finish_task_switch$|^finish_task_switch\.isra\.\d$",
                fn_name="oncpu")
matched = b.num_open_kprobes()
if matched == 0:
    print("error: 0 functions traced. Exiting.", file=stderr)
    exit(4)

# header
if not folded:
    print("Tracing off-CPU time (us) of %s by %s stack" %
          (thread_context, stack_context), end="")
    if duration < 99999999:
        print(" for %d secs." % duration)
    else:
        print("... Hit Ctrl-C to end.")


def print_warn_event(cpu, data, size):
    """Perf-buffer callback: report an event whose end timestamp preceded
    its start timestamp.

    See https://github.com/iovisor/bcc/pull/3227 for those wondering how can this happen.
    """
    event = b["warn_events"].event(data)
    print("WARN: Skipped an event with negative duration: pid:%d, tgid:%d, off-cpu:%d, on-cpu:%d"
          % (event.pid, event.tgid, event.t_start, event.t_end),
          file=stderr)

b["warn_events"].open_perf_buffer(print_warn_event)
try:
    # poll the warn-event buffer until the requested duration has elapsed
    duration_ms = duration * 1000
    start_time_ms = int(BPF.monotonic_time() / 1000000)
    while True:
        elapsed_ms = int(BPF.monotonic_time() / 1000000) - start_time_ms
        if elapsed_ms >= duration_ms:
            break
        b.perf_buffer_poll(timeout=duration_ms - elapsed_ms)
except KeyboardInterrupt:
    # as cleanup can take many seconds, trap Ctrl-C:
    signal.signal(signal.SIGINT, signal_ignore)

if not folded:
    print()

show_offset = False
if args.offset:
    show_offset = True

missing_stacks = 0
has_enomem = False
counts = b.get_table("counts")
stack_traces = b.get_table("stack_traces")
for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
    # handle get_stackid errors
    if not args.user_stacks_only and stack_id_err(k.kernel_stack_id):
        missing_stacks += 1
        has_enomem = has_enomem or k.kernel_stack_id == -errno.ENOMEM
    if not args.kernel_stacks_only and stack_id_err(k.user_stack_id):
        missing_stacks += 1
        has_enomem = has_enomem or k.user_stack_id == -errno.ENOMEM

    # user stacks will be symbolized by tgid, not pid, to avoid the overhead
    # of one symbol resolver per thread
    user_stack = [] if k.user_stack_id < 0 else \
        stack_traces.walk(k.user_stack_id)
    kernel_stack = [] if k.kernel_stack_id < 0 else \
        stack_traces.walk(k.kernel_stack_id)

    if folded:
        # print folded stack output
        user_stack = list(user_stack)
        kernel_stack = list(kernel_stack)
        line = [k.name.decode('utf-8', 'replace')]
        # if we failed to get the stack id, such as due to no space (-ENOMEM) or
        # hash collision (-EEXIST), we still print a placeholder for consistency
        if not args.kernel_stacks_only:
            if stack_id_err(k.user_stack_id):
                line.append("[Missed User Stack]")
            else:
                line.extend([b.sym(addr, k.tgid).decode('utf-8', 'replace')
                             for addr in reversed(user_stack)])
        if not args.user_stacks_only:
            line.extend(["-"] if (need_delimiter and k.kernel_stack_id >= 0 and k.user_stack_id >= 0) else [])
            if stack_id_err(k.kernel_stack_id):
                line.append("[Missed Kernel Stack]")
            else:
                line.extend([b.ksym(addr).decode('utf-8', 'replace')
                             for addr in reversed(kernel_stack)])
        print("%s %d" % (";".join(line), v.value))
    else:
        # print default multi-line stack output
        if not args.user_stacks_only:
            if stack_id_err(k.kernel_stack_id):
                print("    [Missed Kernel Stack]")
            else:
                for addr in kernel_stack:
                    print("    %s" % b.ksym(addr, show_offset=show_offset).decode('utf-8', 'replace'))
        if not args.kernel_stacks_only:
            if need_delimiter and k.user_stack_id >= 0 and k.kernel_stack_id >= 0:
                print("    --")
            if stack_id_err(k.user_stack_id):
                print("    [Missed User Stack]")
            else:
                for addr in user_stack:
                    print("    %s" % b.sym(addr, k.tgid, show_offset=show_offset).decode('utf-8', 'replace'))
        print("    %-16s %s (%d)" % ("-", k.name.decode('utf-8', 'replace'), k.pid))
        print("        %d\n" % v.value)

if missing_stacks > 0:
    enomem_str = "" if not has_enomem else \
        " Consider increasing --stack-storage-size."
    print("WARNING: %d stack traces lost and could not be displayed.%s" %
          (missing_stacks, enomem_str),
          file=stderr)