#!/usr/bin/env python
# @lint-avoid-python-3-compatibility-imports
#
# runqslower    Trace long process scheduling delays.
#               For Linux, uses BCC, eBPF.
#
# This script traces high scheduling delays between tasks becoming
# ready to run and their subsequent execution on-CPU.
#
# USAGE: runqslower [-p PID] [-t TID] [-P] [min_us]
#
# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support).
#
# This measures the time a task spends waiting on a run queue for a turn
# on-CPU, and shows this time as individual events. This time should be small,
# but a task may need to wait its turn due to CPU load.
#
# This measures two types of run queue latency:
# 1. The time from a task being enqueued on a run queue to its context switch
#    and execution. This traces ttwu_do_wakeup(), wake_up_new_task() ->
#    finish_task_switch() with either raw tracepoints (if supported) or kprobes
#    and instruments the run queue latency after a voluntary context switch.
# 2. The time from when a task was involuntarily context switched, while still
#    in the runnable state, to when it next executed. This is instrumented
#    from finish_task_switch() alone.
#
# Copyright 2016 Cloudflare, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 02-May-2018   Ivan Babrou   Created this.
# 18-Nov-2019   Gergely Bod   BUG fix: Use bpf_probe_read_kernel_str() to extract the
#                               process name from 'task_struct* next' in raw tp code.
#                               bpf_get_current_comm() operates on the current task,
#                               which might already be different from 'next'.

from __future__ import print_function
from bcc import BPF
import argparse
from time import strftime

# arguments
examples = """examples:
    ./runqslower         # trace run queue latency higher than 10000 us (default)
    ./runqslower 1000    # trace run queue latency higher than 1000 us
    ./runqslower -p 123  # trace pid 123
    ./runqslower -t 123  # trace tid 123 (use for threads only)
    ./runqslower -P      # also show previous task comm and TID
"""
parser = argparse.ArgumentParser(
    description="Trace high run queue latency",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("min_us", nargs="?", default='10000',
    help="minimum run queue latency to trace, in us (default 10000)")
parser.add_argument("-P", "--previous", action="store_true",
    help="also show previous task name and TID")
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)

thread_group = parser.add_mutually_exclusive_group()
thread_group.add_argument("-p", "--pid", metavar="PID", dest="pid",
    help="trace this PID only", type=int)
thread_group.add_argument("-t", "--tid", metavar="TID", dest="tid",
    help="trace this TID only", type=int)
args = parser.parse_args()

min_us = int(args.min_us)
debug = 0

# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>

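// start: per-task enqueue timestamps, indexed by kernel pid (thread id)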
BPF_ARRAY(start, u64, MAX_PID);

struct data_t {
    u32 pid;
    u32 prev_pid;
    char task[TASK_COMM_LEN];
    char prev_task[TASK_COMM_LEN];
    u64 delta_us;
};

BPF_PERF_OUTPUT(events);

// record enqueue timestamp
static int trace_enqueue(u32 tgid, u32 pid)
{
    if (FILTER_PID || FILTER_TGID || pid == 0)
        return 0;
    u64 ts = bpf_ktime_get_ns();
    start.update(&pid, &ts);
    return 0;
}
"""

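# kprobe-based variant, used as a fallback when the kernel lacks BPF raw
# tracepoint support (see the attach_kprobe() calls near the bottom)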
bpf_text_kprobe = """
int trace_wake_up_new_task(struct pt_regs *ctx, struct task_struct *p)
{
    return trace_enqueue(p->tgid, p->pid);
}

int trace_ttwu_do_wakeup(struct pt_regs *ctx, struct rq *rq, struct task_struct *p,
    int wake_flags)
{
    return trace_enqueue(p->tgid, p->pid);
}

// calculate latency
int trace_run(struct pt_regs *ctx, struct task_struct *prev)
{
    u32 pid, tgid;

    // ivcsw: treat like an enqueue event and store timestamp
    if (prev->STATE_FIELD == TASK_RUNNING) {
        tgid = prev->tgid;
        pid = prev->pid;
        u64 ts = bpf_ktime_get_ns();
        if (pid != 0) {
            if (!(FILTER_PID) && !(FILTER_TGID)) {
                start.update(&pid, &ts);
            }
        }
    }

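    // the lower 32 bits of bpf_get_current_pid_tgid() are the kernel pid
    // (user-space TID) of the task that is now on-CPU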
    pid = bpf_get_current_pid_tgid();

    u64 *tsp, delta_us;

    // fetch timestamp and calculate delta
    tsp = start.lookup(&pid);
    if ((tsp == 0) || (*tsp == 0)) {
        return 0;   // missed enqueue
    }
    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;

    if (FILTER_US)
        return 0;

    struct data_t data = {};
    data.pid = pid;
    data.prev_pid = prev->pid;
    data.delta_us = delta_us;
    bpf_get_current_comm(&data.task, sizeof(data.task));
    bpf_probe_read_kernel_str(&data.prev_task, sizeof(data.prev_task), prev->comm);

    // output
    events.perf_submit(ctx, &data, sizeof(data));

    // array map has no delete method, set ts to 0 instead
    *tsp = 0;
    return 0;
}
"""

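# raw tracepoint variant: BCC attaches RAW_TRACEPOINT_PROBE() programs
# automatically when the text is loaded, so no explicit attach calls are
# needed for this path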
bpf_text_raw_tp = """
RAW_TRACEPOINT_PROBE(sched_wakeup)
{
    // TP_PROTO(struct task_struct *p)
    struct task_struct *p = (struct task_struct *)ctx->args[0];
    return trace_enqueue(p->tgid, p->pid);
}

RAW_TRACEPOINT_PROBE(sched_wakeup_new)
{
    // TP_PROTO(struct task_struct *p)
    struct task_struct *p = (struct task_struct *)ctx->args[0];
    u32 tgid, pid;

    bpf_probe_read_kernel(&tgid, sizeof(tgid), &p->tgid);
    bpf_probe_read_kernel(&pid, sizeof(pid), &p->pid);
    return trace_enqueue(tgid, pid);
}

RAW_TRACEPOINT_PROBE(sched_switch)
{
    // TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next)
    struct task_struct *prev = (struct task_struct *)ctx->args[1];
    struct task_struct *next = (struct task_struct *)ctx->args[2];
    u32 tgid, pid;
    long state;

    // ivcsw: treat like an enqueue event and store timestamp
    bpf_probe_read_kernel(&state, sizeof(long), (const void *)&prev->STATE_FIELD);
    bpf_probe_read_kernel(&pid, sizeof(prev->pid), &prev->pid);
    if (state == TASK_RUNNING) {
        bpf_probe_read_kernel(&tgid, sizeof(prev->tgid), &prev->tgid);
        u64 ts = bpf_ktime_get_ns();
        if (pid != 0) {
            if (!(FILTER_PID) && !(FILTER_TGID)) {
                start.update(&pid, &ts);
            }
        }
    }

    u32 prev_pid;
    u64 *tsp, delta_us;

    prev_pid = pid;
    bpf_probe_read_kernel(&pid, sizeof(next->pid), &next->pid);

    // fetch timestamp and calculate delta
    tsp = start.lookup(&pid);
    if ((tsp == 0) || (*tsp == 0)) {
        return 0;   // missed enqueue
    }
    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;

    if (FILTER_US)
        return 0;

    struct data_t data = {};
    data.pid = pid;
    data.prev_pid = prev_pid;
    data.delta_us = delta_us;
    bpf_probe_read_kernel_str(&data.task, sizeof(data.task), next->comm);
    bpf_probe_read_kernel_str(&data.prev_task, sizeof(data.prev_task), prev->comm);

    // output
    events.perf_submit(ctx, &data, sizeof(data));

    // array map has no delete method, set ts to 0 instead
    *tsp = 0;
    return 0;
}
"""

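# prefer raw tracepoints when the kernel supports them (roughly Linux 4.17+);
# they have lower overhead and do not depend on kernel function names the way
# kprobes do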
is_support_raw_tp = BPF.support_raw_tracepoint()
if is_support_raw_tp:
    bpf_text += bpf_text_raw_tp
else:
    bpf_text += bpf_text_kprobe

# code substitutions
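# newer kernels renamed task_struct.state to __state; pick whichever field
# this kernel actually has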
if BPF.kernel_struct_has_field(b'task_struct', b'__state') == 1:
    bpf_text = bpf_text.replace('STATE_FIELD', '__state')
else:
    bpf_text = bpf_text.replace('STATE_FIELD', 'state')
if min_us == 0:
    bpf_text = bpf_text.replace('FILTER_US', '0')
else:
    bpf_text = bpf_text.replace('FILTER_US', 'delta_us <= %s' % str(min_us))

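# -t filters on the kernel pid (user-space TID); -p filters on the tgid
# (user-space PID); non-matching tasks are dropped at enqueue time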
if args.tid:
    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % args.tid)
else:
    bpf_text = bpf_text.replace('FILTER_PID', '0')

if args.pid:
    bpf_text = bpf_text.replace('FILTER_TGID', 'tgid != %s' % args.pid)
else:
    bpf_text = bpf_text.replace('FILTER_TGID', '0')

if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()

# process event
def print_event(cpu, data, size):
    event = b["events"].event(data)
    if args.previous:
        print("%-8s %-16s %-6s %14s %-16s %-6s" % (strftime("%H:%M:%S"),
            event.task.decode('utf-8', 'replace'), event.pid, event.delta_us,
            event.prev_task.decode('utf-8', 'replace'), event.prev_pid))
    else:
        print("%-8s %-16s %-6s %14s" % (strftime("%H:%M:%S"),
            event.task.decode('utf-8', 'replace'), event.pid, event.delta_us))

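# MAX_PID in the BPF text is defined from pid_max via cflags below, so the
# start array has a slot for every possible pid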
max_pid = int(open("/proc/sys/kernel/pid_max").read())

# load BPF program
b = BPF(text=bpf_text, cflags=["-DMAX_PID=%d" % max_pid])
if not is_support_raw_tp:
    b.attach_kprobe(event="ttwu_do_wakeup", fn_name="trace_ttwu_do_wakeup")
    b.attach_kprobe(event="wake_up_new_task", fn_name="trace_wake_up_new_task")
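    # the regex also matches finish_task_switch.isra.N, the name the compiler
    # may give an optimized copy of the function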
    b.attach_kprobe(event_re=r"^finish_task_switch$|^finish_task_switch\.isra\.\d$",
                    fn_name="trace_run")

print("Tracing run queue latency higher than %d us" % min_us)
if args.previous:
    print("%-8s %-16s %-6s %14s %-16s %-6s" % ("TIME", "COMM", "TID", "LAT(us)",
        "PREV COMM", "PREV TID"))
else:
    print("%-8s %-16s %-6s %14s" % ("TIME", "COMM", "TID", "LAT(us)"))

# read events
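# a 64-page per-CPU ring buffer reduces the chance of lost events under load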
b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
    try:
        b.perf_buffer_poll()
    except KeyboardInterrupt:
        exit()