#!/usr/bin/env python
# @lint-avoid-python-3-compatibility-imports
#
# biotop  block device (disk) I/O by process.
#         For Linux, uses BCC, eBPF.
#
# USAGE: biotop.py [-h] [-C] [-r MAXROWS] [-p PID] [interval] [count]
#
# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
# request, as well as a starting timestamp for calculating I/O latency.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 06-Feb-2016   Brendan Gregg   Created this.
# 17-Mar-2022   Rocky Xing      Added PID filter support.
# 01-Aug-2023   Jerome Marchand Added support for block tracepoints

from __future__ import print_function
from bcc import BPF
from time import sleep, strftime
import argparse
from subprocess import call

# arguments
examples = """examples:
    ./biotop            # block device I/O top, 1 second refresh
    ./biotop -C         # don't clear the screen
    ./biotop -p 181     # only trace PID 181
    ./biotop 5          # 5 second summaries
    ./biotop 5 10       # 5 second summaries, 10 times only
"""
parser = argparse.ArgumentParser(
    description="Block device (disk) I/O by process",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-C", "--noclear", action="store_true",
    help="don't clear the screen")
parser.add_argument("-r", "--maxrows", default=20,
    help="maximum rows to print, default 20")
parser.add_argument("-p", "--pid", type=int, metavar="PID",
    help="trace this PID only")
parser.add_argument("interval", nargs="?", default=1,
    help="output interval, in seconds")
parser.add_argument("count", nargs="?", default=99999999,
    help="number of outputs")
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
interval = int(args.interval)
countdown = int(args.count)
maxrows = int(args.maxrows)
clear = not int(args.noclear)

# linux stats
loadavg = "/proc/loadavg"
diskstats = "/proc/diskstats"

# load BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/blk-mq.h>

// for saving the timestamp and __data_len of each request
struct start_req_t {
    u64 ts;
    u64 data_len;
};

// for saving process info by request
struct who_t {
    u32 pid;
    char name[TASK_COMM_LEN];
};

// the key for the output summary
struct info_t {
    u32 pid;
    int rwflag;
    int major;
    int minor;
    char name[TASK_COMM_LEN];
};

// the value of the output summary
struct val_t {
    u64 bytes;
    u64 us;
    u32 io;
};

// argument layout of the block:block_io_start/block_io_done tracepoints
// (the leading u64 skips the common tracepoint fields)
struct tp_args {
    u64 __unused__;
    dev_t dev;
    sector_t sector;
    unsigned int nr_sector;
    unsigned int bytes;
    char rwbs[8];
    char comm[16];
    char cmd[];
};

// key used to correlate an I/O from issue to completion
struct hash_key {
    dev_t dev;
    u32 _pad;
    sector_t sector;
};

BPF_HASH(start, struct hash_key, struct start_req_t);
BPF_HASH(whobyreq, struct hash_key, struct who_t);
BPF_HASH(counts, struct info_t, struct val_t);

// pack major/minor into the kernel's internal dev_t layout (20-bit minor)
static dev_t ddevt(struct gendisk *disk) {
    return (disk->major << 20) | disk->first_minor;
}

// cache PID and comm by-req
static int __trace_pid_start(struct hash_key key)
{
    struct who_t who;
    u32 pid;

    if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) {
        pid = bpf_get_current_pid_tgid() >> 32;
        if (FILTER_PID)
            return 0;

        who.pid = pid;
        whobyreq.update(&key, &who);
    }

    return 0;
}

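/*
 * Each handler below just builds a hash_key for the request: the *_tp
 * variants take the block tracepoint arguments (struct tp_args), while the
 * others are kprobe handlers that receive a struct request.
 */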
int trace_pid_start(struct pt_regs *ctx, struct request *req)
{
    struct hash_key key = {
        .dev = ddevt(req->__RQ_DISK__),
        .sector = req->__sector
    };

    return __trace_pid_start(key);
}

int trace_pid_start_tp(struct tp_args *args)
{
    struct hash_key key = {
        .dev = args->dev,
        .sector = args->sector
    };

    return __trace_pid_start(key);
}

// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
    struct hash_key key = {
        .dev = ddevt(req->__RQ_DISK__),
        .sector = req->__sector
    };
    struct start_req_t start_req = {
        .ts = bpf_ktime_get_ns(),
        .data_len = req->__data_len
    };
    start.update(&key, &start_req);
    return 0;
}

// output
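// (aggregate a completed I/O: look up its start timestamp and the PID/comm
// cached at issue time, then sum bytes, latency and I/O count per info_t key)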
static int __trace_req_completion(struct hash_key key)
{
    struct start_req_t *startp;

    // fetch timestamp and calculate delta
    startp = start.lookup(&key);
    if (startp == 0) {
        return 0;    // missed tracing issue
    }

    struct who_t *whop;
    u32 pid;

    whop = whobyreq.lookup(&key);
    pid = whop != 0 ? whop->pid : 0;
    if (FILTER_PID) {
        start.delete(&key);
        if (whop != 0) {
            whobyreq.delete(&key);
        }
        return 0;
    }

    struct val_t *valp, zero = {};
    u64 delta_us = (bpf_ktime_get_ns() - startp->ts) / 1000;

    // setup info_t key
    struct info_t info = {};
    info.major = key.dev >> 20;
    info.minor = key.dev & ((1 << 20) - 1);
/*
 * The following deals with a kernel version change (in mainline 4.7, although
 * it may be backported to earlier kernels) with how block request write flags
 * are tested. We handle both pre- and post-change versions here. Please avoid
 * kernel version tests like this as much as possible: they inflate the code,
 * test, and maintenance burden.
 */
/*#ifdef REQ_WRITE
    info.rwflag = !!(req->cmd_flags & REQ_WRITE);
#elif defined(REQ_OP_SHIFT)
    info.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
#else
    info.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
#endif*/

    if (whop == 0) {
        // missed pid who, save stats as pid 0
        valp = counts.lookup_or_try_init(&info, &zero);
    } else {
        info.pid = whop->pid;
        __builtin_memcpy(&info.name, whop->name, sizeof(info.name));
        valp = counts.lookup_or_try_init(&info, &zero);
    }

    if (valp) {
        // save stats
        valp->us += delta_us;
        valp->bytes += startp->data_len;
        valp->io++;
    }

    start.delete(&key);
    whobyreq.delete(&key);

    return 0;
}

int trace_req_completion(struct pt_regs *ctx, struct request *req)
{
    struct hash_key key = {
        .dev = ddevt(req->__RQ_DISK__),
        .sector = req->__sector
    };

    return __trace_req_completion(key);
}

int trace_req_completion_tp(struct tp_args *args)
{
    struct hash_key key = {
        .dev = args->dev,
        .sector = args->sector
    };

    return __trace_req_completion(key);
}
"""

if args.ebpf:
    print(bpf_text)
    exit()

if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
    bpf_text = bpf_text.replace('__RQ_DISK__', 'rq_disk')
else:
    bpf_text = bpf_text.replace('__RQ_DISK__', 'q->disk')

if args.pid is not None:
    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %d' % args.pid)
else:
    bpf_text = bpf_text.replace('FILTER_PID', '0')

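# initialize BPF and attach to whichever hooks this kernel provides: the I/O
# accounting kprobes where they exist, otherwise the block:block_io_start /
# block_io_done tracepoints; blk_start_request only exists on older kernels
# that still have the legacy (non-multiqueue) request path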
b = BPF(text=bpf_text)
if BPF.get_kprobe_functions(b'__blk_account_io_start'):
    b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_pid_start")
elif BPF.get_kprobe_functions(b'blk_account_io_start'):
    b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
else:
    b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_pid_start_tp")
if BPF.get_kprobe_functions(b'blk_start_request'):
    b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
if BPF.get_kprobe_functions(b'__blk_account_io_done'):
    b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_completion")
elif BPF.get_kprobe_functions(b'blk_account_io_done'):
    b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_completion")
else:
    b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_completion_tp")

print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)

# cache disk major,minor -> diskname
disklookup = {}
with open(diskstats) as stats:
    for line in stats:
        a = line.split()
        disklookup[a[0] + "," + a[1]] = a[2]

# output
exiting = 0
while 1:
    try:
        sleep(interval)
    except KeyboardInterrupt:
        exiting = 1

    # header
    if clear:
        call("clear")
    else:
        print()
    with open(loadavg) as stats:
        print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))
    print("%-7s %-16s %1s %-3s %-3s %-8s %5s %7s %6s" % ("PID", "COMM",
        "D", "MAJ", "MIN", "DISK", "I/O", "Kbytes", "AVGms"))

    # by-PID output
    counts = b.get_table("counts")
    line = 0
    for k, v in reversed(sorted(counts.items(),
                                key=lambda counts: counts[1].bytes)):

        # lookup disk
        disk = str(k.major) + "," + str(k.minor)
        if disk in disklookup:
            diskname = disklookup[disk]
        else:
            diskname = "?"

        # print line
        avg_ms = (float(v.us) / 1000) / v.io
        print("%-7d %-16s %1s %-3d %-3d %-8s %5s %7s %6.2f" % (k.pid,
            k.name.decode('utf-8', 'replace'), "W" if k.rwflag else "R",
            k.major, k.minor, diskname, v.io, v.bytes / 1024, avg_ms))

        line += 1
        if line >= maxrows:
            break
    counts.clear()

    countdown -= 1
    if exiting or countdown == 0:
        print("Detaching...")
        exit()