xref: /aosp_15_r20/external/bcc/tools/profile.py (revision 387f9dfdfa2baef462e92476d413c7bc2470293e)
1#!/usr/bin/env python
2# @lint-avoid-python-3-compatibility-imports
3#
4# profile  Profile CPU usage by sampling stack traces at a timed interval.
5#          For Linux, uses BCC, BPF, perf_events. Embedded C.
6#
7# This is an efficient profiler, as stack traces are frequency counted in
8# kernel context, rather than passing every stack to user space for frequency
9# counting there. Only the unique stacks and counts are passed to user space
10# at the end of the profile, greatly reducing the kernel<->user transfer.
11#
12# By default CPU idle stacks are excluded by simply excluding PID 0.
13#
14# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is
15# a version of this tool that may work on Linux 4.6 - 4.8.
16#
17# Copyright 2016 Netflix, Inc.
18# Licensed under the Apache License, Version 2.0 (the "License")
19#
20# THANKS: Alexei Starovoitov, who added proper BPF profiling support to Linux;
21# Sasha Goldshtein, Andrew Birchall, and Evgeny Vereshchagin, who wrote much
22# of the code here, borrowed from tracepoint.py and offcputime.py; and
23# Teng Qin, who added perf support in bcc.
24#
25# 15-Jul-2016   Brendan Gregg   Created this.
26# 20-Oct-2016      "      "     Switched to use the new 4.9 support.
27# 26-Jan-2019      "      "     Changed to exclude CPU idle by default.
28# 11-Apr-2023   Rocky Xing      Added option to increase hash storage size.
29
30from __future__ import print_function
31from bcc import BPF, PerfType, PerfSWConfig
32from bcc.containers import filter_by_containers
33from sys import stderr
34from time import sleep
35import argparse
36import signal
37import os
38import errno
39
40#
41# Process Arguments
42#
43
44# arg validation
def positive_int(val):
    """argparse type: parse val as a non-negative integer.

    Raises argparse.ArgumentTypeError when val is not an integer or is
    negative. Note that 0 is accepted (callers rely on 0 meaning "unset",
    e.g. -F/-c); use positive_nonzero_int() where zero must be rejected.
    """
    try:
        ival = int(val)
    except ValueError:
        raise argparse.ArgumentTypeError("must be an integer")

    if ival < 0:
        # only negatives are rejected here; message matches the check
        raise argparse.ArgumentTypeError("must be non-negative")
    return ival
54
def positive_int_list(val):
    """argparse type: parse a comma-separated list of non-negative integers.

    Each element is validated by positive_int(), which raises
    argparse.ArgumentTypeError on bad input. Note str.split(",") always
    yields at least one element (even for an empty string), so no separate
    emptiness check is needed: "" fails inside positive_int().
    """
    return [positive_int(v) for v in val.split(",")]
61
def positive_nonzero_int(val):
    """argparse type: like positive_int(), but additionally rejects zero."""
    parsed = positive_int(val)
    if not parsed:
        raise argparse.ArgumentTypeError("must be nonzero")
    return parsed
67
def stack_id_err(stack_id):
    """Return True when stack_id is a get_stackid() error worth reporting.

    A -EFAULT result normally just means the stack trace is unavailable
    (such as asking for a kernel stack while in userspace code), so it is
    deliberately not treated as an error here.
    """
    if stack_id >= 0:
        return False
    return stack_id != -errno.EFAULT
72
# arguments
examples = """examples:
    ./profile             # profile stack traces at 49 Hertz until Ctrl-C
    ./profile -F 99       # profile stack traces at 99 Hertz
    ./profile -c 1000000  # profile stack traces every 1 in a million events
    ./profile 5           # profile at 49 Hertz for 5 seconds only
    ./profile -f 5        # output in folded format for flame graphs
    ./profile -p 185      # only profile process with PID 185
    ./profile -L 185      # only profile thread with TID 185
    ./profile -U          # only show user space stacks (no kernel)
    ./profile -K          # only show kernel space stacks (no user)
    ./profile --cgroupmap mappath  # only trace cgroups in this BPF map
    ./profile --mntnsmap mappath   # only trace mount namespaces in the map
"""
parser = argparse.ArgumentParser(
    description="Profile CPU stack traces at a timed interval",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
# -p and -L are mutually exclusive: filter by process (tgid) or thread (pid)
thread_group = parser.add_mutually_exclusive_group()
thread_group.add_argument("-p", "--pid", type=positive_int_list,
    help="profile process with one or more comma separated PIDs only")
thread_group.add_argument("-L", "--tid", type=positive_int_list,
    help="profile thread with one or more comma separated TIDs only")
# TODO: add options for user/kernel threads only
stack_group = parser.add_mutually_exclusive_group()
stack_group.add_argument("-U", "--user-stacks-only", action="store_true",
    help="show stacks from user space only (no kernel space stacks)")
stack_group.add_argument("-K", "--kernel-stacks-only", action="store_true",
    help="show stacks from kernel space only (no user space stacks)")
# sampling is either timed (-F, Hertz) or event-based (-c, period)
sample_group = parser.add_mutually_exclusive_group()
sample_group.add_argument("-F", "--frequency", type=positive_int,
    help="sample frequency, Hertz")
sample_group.add_argument("-c", "--count", type=positive_int,
    help="sample period, number of events")
parser.add_argument("-d", "--delimited", action="store_true",
    help="insert delimiter between kernel/user stacks")
parser.add_argument("-a", "--annotations", action="store_true",
    help="add _[k] annotations to kernel frames")
parser.add_argument("-I", "--include-idle", action="store_true",
    help="include CPU idle stacks")
parser.add_argument("-f", "--folded", action="store_true",
    help="output folded format, one line per stack (for flame graphs)")
parser.add_argument("--hash-storage-size", default=40960,
    type=positive_nonzero_int,
    help="the number of hash keys that can be stored (default %(default)s)")
parser.add_argument("--stack-storage-size", default=16384,
    type=positive_nonzero_int,
    help="the number of unique stack traces that can be stored and "
        "displayed (default %(default)s)")
parser.add_argument("duration", nargs="?", default=99999999,
    type=positive_nonzero_int,
    help="duration of trace, in seconds")
parser.add_argument("-C", "--cpu", type=int, default=-1,
    help="cpu number to run profile on")
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
parser.add_argument("--cgroupmap",
    help="trace cgroups in this BPF map only")
parser.add_argument("--mntnsmap",
    help="trace mount namespaces in this BPF map only")

# option logic
args = parser.parse_args()
duration = int(args.duration)
debug = 0
# a delimiter between stack halves only makes sense when both are shown
need_delimiter = args.delimited and not (args.kernel_stacks_only or
    args.user_stacks_only)
# TODO: add stack depth, and interval
141
#
# Setup BPF
#

# define BPF program
# NOTE: the ALL-CAPS tokens (USE_PIDNS, PIDNS_DEV, PIDNS_INO, IDLE_FILTER,
# THREAD_FILTER, HASH_STORAGE_SIZE, STACK_STORAGE_SIZE, USER_STACK_GET,
# KERNEL_STACK_GET) are placeholders; the Python code below substitutes
# them with concrete C expressions before the program is compiled.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <uapi/linux/bpf_perf_event.h>
#include <linux/sched.h>

struct key_t {
    u32 pid;
    u64 kernel_ip;
    int user_stack_id;
    int kernel_stack_id;
    char name[TASK_COMM_LEN];
};
BPF_HASH(counts, struct key_t, u64, HASH_STORAGE_SIZE);
BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);

// This code gets a bit complex. Probably not suitable for casual hacking.

int do_perf_event(struct bpf_perf_event_data *ctx) {
    u32 tgid = 0;
    u32 pid = 0;

    struct bpf_pidns_info ns = {};
    if (USE_PIDNS && !bpf_get_ns_current_pid_tgid(PIDNS_DEV, PIDNS_INO, &ns, sizeof(struct bpf_pidns_info))) {
        tgid = ns.tgid;
        pid = ns.pid;
    } else {
        u64 id = bpf_get_current_pid_tgid();
        tgid = id >> 32;
        pid = id;
    }

    if (IDLE_FILTER)
        return 0;

    if (!(THREAD_FILTER))
        return 0;

    if (container_should_be_filtered()) {
        return 0;
    }

    // create map key
    struct key_t key = {.pid = tgid};
    bpf_get_current_comm(&key.name, sizeof(key.name));

    // get stacks
    key.user_stack_id = USER_STACK_GET;
    key.kernel_stack_id = KERNEL_STACK_GET;

    if (key.kernel_stack_id >= 0) {
        // populate extras to fix the kernel stack
        u64 ip = PT_REGS_IP(&ctx->regs);
        u64 page_offset;

        // if ip isn't sane, leave key ips as zero for later checking
#if defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE)
        // x64, 4.16, ..., 4.11, etc., but some earlier kernel didn't have it
        page_offset = __PAGE_OFFSET_BASE;
#elif defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE_L4)
        // x64, 4.17, and later
#if defined(CONFIG_DYNAMIC_MEMORY_LAYOUT) && defined(CONFIG_X86_5LEVEL)
        page_offset = __PAGE_OFFSET_BASE_L5;
#else
        page_offset = __PAGE_OFFSET_BASE_L4;
#endif
#else
        // earlier x86_64 kernels, e.g., 4.6, comes here
        // arm64, s390, powerpc, x86_32
        page_offset = PAGE_OFFSET;
#endif

        if (ip > page_offset) {
            key.kernel_ip = ip;
        }
    }

    counts.increment(key);
    return 0;
}
"""
227
# pid-namespace translation: if we can stat our own pid namespace, enable
# bpf_get_ns_current_pid_tgid() in the BPF program, keyed by the
# namespace's device and inode, so ids match this (possibly containerized)
# namespace; otherwise fall back to raw root-namespace ids.
try:
    devinfo = os.stat("/proc/self/ns/pid")
    bpf_text = bpf_text.replace('USE_PIDNS', "1")
    bpf_text = bpf_text.replace('PIDNS_DEV', str(devinfo.st_dev))
    bpf_text = bpf_text.replace('PIDNS_INO', str(devinfo.st_ino))
except OSError:
    # /proc/self/ns/pid unavailable: disable in-kernel pidns translation.
    # (Was a bare except; os.stat() raises OSError on failure.)
    bpf_text = bpf_text.replace('USE_PIDNS', "0")
    bpf_text = bpf_text.replace('PIDNS_DEV', "0")
    bpf_text = bpf_text.replace('PIDNS_INO', "0")
238
# set idle filter: unless -I/--include-idle was given, compile in a
# "pid == 0" test so samples from the idle task are dropped in-kernel
bpf_text = bpf_text.replace('IDLE_FILTER',
    "0" if args.include_idle else "pid == 0")
244
# set process/thread filter: build the C expression selecting which tasks
# are sampled — tgid match for -p, pid match for -L, otherwise everything
if args.pid is not None:
    thread_context = "PID %s" % args.pid
    thread_filter = " || ".join("tgid == %d" % pid for pid in args.pid)
elif args.tid is not None:
    thread_context = "TID %s" % args.tid
    thread_filter = " || ".join("pid == %d" % tid for tid in args.tid)
else:
    thread_context = "all threads"
    thread_filter = '1'
bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter)
258
# size the BPF maps from the command-line options
bpf_text = bpf_text.replace('HASH_STORAGE_SIZE', str(args.hash_storage_size))
bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size))

# handle stack args: -U/-K suppress one side by compiling in a constant -1
# stack id in place of the get_stackid() call
if args.user_stacks_only:
    stack_context = "user"
    user_stack_get = "stack_traces.get_stackid(&ctx->regs, BPF_F_USER_STACK)"
    kernel_stack_get = "-1"
elif args.kernel_stacks_only:
    stack_context = "kernel"
    user_stack_get = "-1"
    kernel_stack_get = "stack_traces.get_stackid(&ctx->regs, 0)"
else:
    stack_context = "user + kernel"
    user_stack_get = "stack_traces.get_stackid(&ctx->regs, BPF_F_USER_STACK)"
    kernel_stack_get = "stack_traces.get_stackid(&ctx->regs, 0)"
bpf_text = bpf_text.replace('USER_STACK_GET', user_stack_get)
bpf_text = bpf_text.replace('KERNEL_STACK_GET', kernel_stack_get)
# container filtering (--cgroupmap/--mntnsmap) prepends its helper code
bpf_text = filter_by_containers(args) + bpf_text
278
# choose the sampling mode: a frequency in Hertz (-F) or an event period
# (-c); with neither given, default to 49 Hertz sampling
sample_freq = 0
sample_period = 0
if args.frequency:
    sample_freq = args.frequency
elif args.count:
    sample_period = args.count
else:
    sample_freq = 49
if sample_freq:
    sample_context = "%d Hertz" % sample_freq
else:
    sample_context = "every %d events" % sample_period
290
# header: suppressed for folded output, which must stay machine-parseable
if not args.folded:
    print("Sampling at %s of %s by %s stack" %
        (sample_context, thread_context, stack_context), end="")
    if args.cpu >= 0:
        print(" on CPU#{}".format(args.cpu), end="")
    # the sentinel default duration (99999999) means "run until Ctrl-C"
    print(" for %d secs." % duration if duration < 99999999
        else "... Hit Ctrl-C to end.")

# --ebpf dumps the generated C program and exits without tracing
if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()
306
# initialize BPF & perf_events: compile the program and attach
# do_perf_event() to a software CPU-clock event on the requested CPU(s)
b = BPF(text=bpf_text)
b.attach_perf_event(
    ev_type=PerfType.SOFTWARE,
    ev_config=PerfSWConfig.CPU_CLOCK,
    fn_name="do_perf_event",
    sample_period=sample_period,
    sample_freq=sample_freq,
    cpu=args.cpu)
312
# signal handler
def signal_ignore(sig, frame):
    """SIGINT handler that swallows the signal, printing only a newline.

    Installed after the first Ctrl-C so that the (possibly slow) report
    output below is not aborted by further Ctrl-C presses. Parameters
    renamed from signal/frame so the first no longer shadows the imported
    signal module; the handler is invoked positionally, so this is safe.
    """
    print()
316
#
# Output Report
#

# collect samples
try:
    # block while the BPF program aggregates samples in-kernel; the default
    # duration (99999999) effectively means "sleep until interrupted"
    sleep(duration)
except KeyboardInterrupt:
    # as cleanup can take some time, trap Ctrl-C:
    # re-arm SIGINT to the newline-printing no-op so a second Ctrl-C while
    # reading/printing the tables below does not abort the report
    signal.signal(signal.SIGINT, signal_ignore)

if not args.folded:
    # blank line separating the header from the stack output
    print()
330
def aksym(addr):
    """Resolve a kernel address to a symbol name (bytes), appending the
    "_[k]" kernel marker when -a/--annotations was given."""
    sym = b.ksym(addr)
    return sym + "_[k]".encode() if args.annotations else sym
336
# output stacks
missing_stacks = 0
has_collision = False
counts = b.get_table("counts")
# if every hash slot is occupied, later samples were likely dropped
htab_full = args.hash_storage_size == len(counts)
stack_traces = b.get_table("stack_traces")
# iterate in ascending sample count so the hottest stacks print last
for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
    # handle get_stackid errors
    if not args.user_stacks_only and stack_id_err(k.kernel_stack_id):
        missing_stacks += 1
        # hash collision (-EEXIST) suggests that the map size may be too small
        has_collision = has_collision or k.kernel_stack_id == -errno.EEXIST
    if not args.kernel_stacks_only and stack_id_err(k.user_stack_id):
        missing_stacks += 1
        has_collision = has_collision or k.user_stack_id == -errno.EEXIST

    # negative ids cover both "side disabled" (-1 compiled in) and errors
    user_stack = [] if k.user_stack_id < 0 else \
        stack_traces.walk(k.user_stack_id)
    kernel_tmp = [] if k.kernel_stack_id < 0 else \
        stack_traces.walk(k.kernel_stack_id)

    # fix kernel stack: prepend the sampled instruction pointer, which the
    # BPF program stored in kernel_ip only when it looked like a kernel
    # address (ip > page_offset); 0 means "not sane, skip"
    kernel_stack = []
    if k.kernel_stack_id >= 0:
        for addr in kernel_tmp:
            kernel_stack.append(addr)
        # the later IP checking
        if k.kernel_ip:
            kernel_stack.insert(0, k.kernel_ip)

    if args.folded:
        # print folded stack output: one semicolon-joined line per stack,
        # leaf-last, followed by the sample count (flame graph input format)
        user_stack = list(user_stack)
        kernel_stack = list(kernel_stack)
        line = [k.name.decode('utf-8', 'replace')]
        # if we failed to get the stack id, such as due to no space (-ENOMEM) or
        # hash collision (-EEXIST), we still print a placeholder for consistency
        if not args.kernel_stacks_only:
            if stack_id_err(k.user_stack_id):
                line.append("[Missed User Stack]")
            else:
                line.extend([b.sym(addr, k.pid).decode('utf-8', 'replace') for addr in reversed(user_stack)])
        if not args.user_stacks_only:
            # -d delimiter goes between the user and kernel halves
            line.extend(["-"] if (need_delimiter and k.kernel_stack_id >= 0 and k.user_stack_id >= 0) else [])
            if stack_id_err(k.kernel_stack_id):
                line.append("[Missed Kernel Stack]")
            else:
                line.extend([aksym(addr).decode('utf-8', 'replace') for addr in reversed(kernel_stack)])
        print("%s %d" % (";".join(line), v.value))
    else:
        # print default multi-line stack output: kernel frames first
        # (leaf-first), then user frames, then comm/pid and the count
        if not args.user_stacks_only:
            if stack_id_err(k.kernel_stack_id):
                print("    [Missed Kernel Stack]")
            else:
                for addr in kernel_stack:
                    print("    %s" % aksym(addr).decode('utf-8', 'replace'))
        if not args.kernel_stacks_only:
            if need_delimiter and k.user_stack_id >= 0 and k.kernel_stack_id >= 0:
                print("    --")
            if stack_id_err(k.user_stack_id):
                print("    [Missed User Stack]")
            else:
                for addr in user_stack:
                    print("    %s" % b.sym(addr, k.pid).decode('utf-8', 'replace'))
        print("    %-16s %s (%d)" % ("-", k.name.decode('utf-8', 'replace'), k.pid))
        print("        %d\n" % v.value)
404
# check missing: report dropped stack traces, suggesting a bigger stack
# map only when a hash collision (-EEXIST) was actually observed
if missing_stacks > 0:
    suffix = " Consider increasing --stack-storage-size." if has_collision \
        else ""
    print("WARNING: %d stack traces could not be displayed.%s" %
        (missing_stacks, suffix),
        file=stderr)

# check whether hash table is full
if htab_full:
    print("WARNING: hash table full. Consider increasing --hash-storage-size.",
        file=stderr)
417