xref: /aosp_15_r20/external/bcc/tools/biotop.py (revision 387f9dfdfa2baef462e92476d413c7bc2470293e)
1#!/usr/bin/env python
2# @lint-avoid-python-3-compatibility-imports
3#
4# biotop  block device (disk) I/O by process.
5#         For Linux, uses BCC, eBPF.
6#
7# USAGE: biotop.py [-h] [-C] [-r MAXROWS] [-p PID] [interval] [count]
8#
9# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
10# request, as well as a starting timestamp for calculating I/O latency.
11#
12# Copyright 2016 Netflix, Inc.
13# Licensed under the Apache License, Version 2.0 (the "License")
14#
15# 06-Feb-2016   Brendan Gregg   Created this.
16# 17-Mar-2022   Rocky Xing      Added PID filter support.
17# 01-Aug-2023   Jerome Marchand Added support for block tracepoints
18
19from __future__ import print_function
20from bcc import BPF
21from time import sleep, strftime
22import argparse
23from subprocess import call
24
# arguments
#
# Every numeric option is declared with type=int so that argparse itself
# rejects a non-numeric value with a usage message, instead of the script
# dying later with a raw ValueError traceback.
examples = """examples:
    ./biotop            # block device I/O top, 1 second refresh
    ./biotop -C         # don't clear the screen
    ./biotop -p 181     # only trace PID 181
    ./biotop 5          # 5 second summaries
    ./biotop 5 10       # 5 second summaries, 10 times only
"""
parser = argparse.ArgumentParser(
    description="Block device (disk) I/O by process",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-C", "--noclear", action="store_true",
    help="don't clear the screen")
parser.add_argument("-r", "--maxrows", type=int, default=20,
    help="maximum rows to print, default 20")
parser.add_argument("-p", "--pid", type=int, metavar="PID",
    help="trace this PID only")
parser.add_argument("interval", nargs="?", type=int, default=1,
    help="output interval, in seconds")
parser.add_argument("count", nargs="?", type=int, default=99999999,
    help="number of outputs")
# --ebpf is hidden from --help; when given, the script prints the BPF program
# text and exits.
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
interval = args.interval      # seconds slept between summaries
countdown = args.count        # summaries left to print before exiting
maxrows = args.maxrows        # maximum rows printed per summary
clear = not args.noclear      # clear the screen before each summary?

# linux stats
loadavg = "/proc/loadavg"       # read for the header line of every summary
diskstats = "/proc/diskstats"   # read once to map major,minor -> disk name
58
# load BPF program
#
# The C text below contains two placeholders that are substituted before the
# program is compiled:
#   __RQ_DISK__  the struct request member path that leads to the gendisk
#   FILTER_PID   a boolean C expression that is true for I/O to be skipped
#
# The request's cmd_flags are saved when the request is started (where the
# struct request pointer is available) so that the completion handler, which
# only receives the (dev, sector) key, can tell reads from writes.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/blk-mq.h>

// for saving the timestamp, __data_len and cmd_flags of each request
struct start_req_t {
    u64 ts;
    u64 data_len;
    u64 cmd_flags;
};

// for saving process info by request
struct who_t {
    u32 pid;
    char name[TASK_COMM_LEN];
};

// the key for the output summary
struct info_t {
    u32 pid;
    int rwflag;
    int major;
    int minor;
    char name[TASK_COMM_LEN];
};

// the value of the output summary
struct val_t {
    u64 bytes;
    u64 us;
    u32 io;
};

struct tp_args {
    u64 __unused__;
    dev_t dev;
    sector_t sector;
    unsigned int nr_sector;
    unsigned int bytes;
    char rwbs[8];
    char comm[16];
    char cmd[];
};

struct hash_key {
    dev_t dev;
    u32 _pad;
    sector_t sector;
};

BPF_HASH(start, struct hash_key, struct start_req_t);
BPF_HASH(whobyreq, struct hash_key, struct who_t);
BPF_HASH(counts, struct info_t, struct val_t);

static dev_t ddevt(struct gendisk *disk) {
    return (disk->major  << 20) | disk->first_minor;
}

// cache PID and comm by-req
static int __trace_pid_start(struct hash_key key)
{
    struct who_t who;
    u32 pid;

    if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) {
        pid = bpf_get_current_pid_tgid() >> 32;
        if (FILTER_PID)
            return 0;

        who.pid = pid;
        whobyreq.update(&key, &who);
    }

    return 0;
}

int trace_pid_start(struct pt_regs *ctx, struct request *req)
{
    struct hash_key key = {
        .dev = ddevt(req->__RQ_DISK__),
        .sector = req->__sector
    };

    return __trace_pid_start(key);
}

int trace_pid_start_tp(struct tp_args *args)
{
    struct hash_key key = {
        .dev = args->dev,
        .sector = args->sector
    };

    return __trace_pid_start(key);
}

// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
    struct hash_key key = {
        .dev = ddevt(req->__RQ_DISK__),
        .sector = req->__sector
    };
    struct start_req_t start_req = {
        .ts = bpf_ktime_get_ns(),
        .data_len = req->__data_len,
        .cmd_flags = req->cmd_flags
    };
    start.update(&key, &start_req);
    return 0;
}

// output
static int __trace_req_completion(struct hash_key key)
{
    struct start_req_t *startp;

    // fetch timestamp and calculate delta
    startp = start.lookup(&key);
    if (startp == 0) {
        return 0;    // missed tracing issue
    }

    struct who_t *whop;
    u32 pid;

    whop = whobyreq.lookup(&key);
    pid = whop != 0 ? whop->pid : 0;
    if (FILTER_PID) {
        start.delete(&key);
        if (whop != 0) {
            whobyreq.delete(&key);
        }
        return 0;
    }

    struct val_t *valp, zero = {};
    u64 delta_us = (bpf_ktime_get_ns() - startp->ts) / 1000;

    // setup info_t key
    struct info_t info = {};
    info.major = key.dev >> 20;
    info.minor = key.dev & ((1 << 20) - 1);
/*
 * The following deals with a kernel version change (in mainline 4.7, although
 * it may be backported to earlier kernels) with how block request write flags
 * are tested. We handle both pre- and post-change versions here. Please avoid
 * kernel version tests like this as much as possible: they inflate the code,
 * test, and maintenance burden.
 *
 * The flags tested are the ones saved by trace_req_start(), because the
 * request itself is not available here.
 */
#ifdef REQ_WRITE
    info.rwflag = !!(startp->cmd_flags & REQ_WRITE);
#elif defined(REQ_OP_SHIFT)
    info.rwflag = !!((startp->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
#else
    info.rwflag = !!((startp->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
#endif

    if (whop == 0) {
        // missed pid who, save stats as pid 0
        valp = counts.lookup_or_try_init(&info, &zero);
    } else {
        info.pid = whop->pid;
        __builtin_memcpy(&info.name, whop->name, sizeof(info.name));
        valp = counts.lookup_or_try_init(&info, &zero);
    }

    if (valp) {
        // save stats
        valp->us += delta_us;
        valp->bytes += startp->data_len;
        valp->io++;
    }

    start.delete(&key);
    whobyreq.delete(&key);

    return 0;
}

int trace_req_completion(struct pt_regs *ctx, struct request *req)
{
    struct hash_key key = {
        .dev = ddevt(req->__RQ_DISK__),
        .sector = req->__sector
    };

    return __trace_req_completion(key);
}

int trace_req_completion_tp(struct tp_args *args)
{
    struct hash_key key = {
        .dev = args->dev,
        .sector = args->sector
    };

    return __trace_req_completion(key);
}
"""
258
# Resolve the placeholders in the BPF source before doing anything else with
# it, so that the text dumped by --ebpf is exactly the program that would be
# compiled rather than a template still containing __RQ_DISK__ / FILTER_PID.

# Pick the struct request member path that leads to the gendisk, depending on
# whether the running kernel's struct request has an rq_disk field.
if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
    bpf_text = bpf_text.replace('__RQ_DISK__', 'rq_disk')
else:
    bpf_text = bpf_text.replace('__RQ_DISK__', 'q->disk')

# FILTER_PID is used as an "if (...) skip" condition in the BPF program:
# true for every PID except the requested one, or constant 0 (never skip)
# when no -p option was given.
if args.pid is not None:
    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %d' % args.pid)
else:
    bpf_text = bpf_text.replace('FILTER_PID', '0')

# hidden --ebpf option: print the final program text and stop
if args.ebpf:
    print(bpf_text)
    exit()
272
# compile the program
b = BPF(text=bpf_text)

# Hook the point where the issuing process is recorded: use the first of the
# accounting functions that can be kprobed, otherwise fall back to the
# block:block_io_start tracepoint.
for symbol in ("__blk_account_io_start", "blk_account_io_start"):
    if BPF.get_kprobe_functions(symbol.encode()):
        b.attach_kprobe(event=symbol, fn_name="trace_pid_start")
        break
else:
    b.attach_tracepoint(tp="block:block_io_start",
                        fn_name="trace_pid_start_tp")

# Hook request start (timestamping): blk_start_request only when it can be
# kprobed, blk_mq_start_request unconditionally.
if BPF.get_kprobe_functions(b'blk_start_request'):
    b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")

# Hook request completion with the same fallback scheme as the start side.
for symbol in ("__blk_account_io_done", "blk_account_io_done"):
    if BPF.get_kprobe_functions(symbol.encode()):
        b.attach_kprobe(event=symbol, fn_name="trace_req_completion")
        break
else:
    b.attach_tracepoint(tp="block:block_io_done",
                        fn_name="trace_req_completion_tp")
289
print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)

# cache disk major,minor -> diskname
# The first three whitespace-separated fields of each diskstats line are used
# as major, minor and device name; the key is the string "major,minor".
with open(diskstats) as stats:
    disklookup = {
        fields[0] + "," + fields[1]: fields[2]
        for fields in (entry.split() for entry in stats)
    }
298
# output
#
# Once per interval: print a header, dump the per-process counters collected
# by the BPF program (largest byte count first, at most maxrows rows), then
# reset the counters. Ctrl-C during the sleep still prints one final summary
# before exiting.
exiting = 0
while 1:
    try:
        sleep(interval)
    except KeyboardInterrupt:
        exiting = 1

    # header
    if clear:
        call("clear")
    else:
        print()
    with open(loadavg) as stats:
        print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))
    print("%-7s %-16s %1s %-3s %-3s %-8s %5s %7s %6s" % ("PID", "COMM",
        "D", "MAJ", "MIN", "DISK", "I/O", "Kbytes", "AVGms"))

    # by-PID output
    counts = b.get_table("counts")
    line = 0
    for k, v in sorted(counts.items(), key=lambda kv: kv[1].bytes,
                       reverse=True):

        # lookup disk
        diskname = disklookup.get(str(k.major) + "," + str(k.minor), "?")

        # An entry can be read after the BPF program created it but before
        # its I/O count was incremented; report 0 rather than dividing by 0.
        avg_ms = (float(v.us) / 1000) / v.io if v.io else 0.0

        # print line
        # Floor division keeps Kbytes an integer on Python 3 as well, so the
        # value fits the 7-character column instead of printing a long float.
        print("%-7d %-16s %1s %-3d %-3d %-8s %5s %7s %6.2f" % (k.pid,
            k.name.decode('utf-8', 'replace'), "W" if k.rwflag else "R",
            k.major, k.minor, diskname, v.io, v.bytes // 1024, avg_ms))

        line += 1
        if line >= maxrows:
            break
    counts.clear()

    countdown -= 1
    if exiting or countdown == 0:
        print("Detaching...")
        exit()
345