1#!/usr/bin/env python 2# @lint-avoid-python-3-compatibility-imports 3# 4# biolatency Summarize block device I/O latency as a histogram. 5# For Linux, uses BCC, eBPF. 6# 7# USAGE: biolatency [-h] [-T] [-Q] [-m] [-D] [-F] [-e] [-j] [-d DISK] [interval] [count] 8# 9# Copyright (c) 2015 Brendan Gregg. 10# Licensed under the Apache License, Version 2.0 (the "License") 11# 12# 20-Sep-2015 Brendan Gregg Created this. 13# 31-Mar-2022 Rocky Xing Added disk filter support. 14# 01-Aug-2023 Jerome Marchand Added support for block tracepoints 15 16from __future__ import print_function 17from bcc import BPF 18from time import sleep, strftime 19import argparse 20import ctypes as ct 21import os 22 23# arguments 24examples = """examples: 25 ./biolatency # summarize block I/O latency as a histogram 26 ./biolatency 1 10 # print 1 second summaries, 10 times 27 ./biolatency -mT 1 # 1s summaries, milliseconds, and timestamps 28 ./biolatency -Q # include OS queued time in I/O time 29 ./biolatency -D # show each disk device separately 30 ./biolatency -F # show I/O flags separately 31 ./biolatency -j # print a dictionary 32 ./biolatency -e # show extension summary(total, average) 33 ./biolatency -d sdc # Trace sdc only 34""" 35parser = argparse.ArgumentParser( 36 description="Summarize block device I/O latency as a histogram", 37 formatter_class=argparse.RawDescriptionHelpFormatter, 38 epilog=examples) 39parser.add_argument("-T", "--timestamp", action="store_true", 40 help="include timestamp on output") 41parser.add_argument("-Q", "--queued", action="store_true", 42 help="include OS queued time in I/O time") 43parser.add_argument("-m", "--milliseconds", action="store_true", 44 help="millisecond histogram") 45parser.add_argument("-D", "--disks", action="store_true", 46 help="print a histogram per disk device") 47parser.add_argument("-F", "--flags", action="store_true", 48 help="print a histogram per set of I/O flags") 49parser.add_argument("-e", "--extension", action="store_true", 50 help="summarize average/total value") 51parser.add_argument("interval", nargs="?", default=99999999, 52 help="output interval, in seconds") 53parser.add_argument("count", nargs="?", default=99999999, 54 help="number of outputs") 55parser.add_argument("--ebpf", action="store_true", 56 help=argparse.SUPPRESS) 57parser.add_argument("-j", "--json", action="store_true", 58 help="json output") 59parser.add_argument("-d", "--disk", type=str, 60 help="Trace this disk only") 61 62args = parser.parse_args() 63countdown = int(args.count) 64debug = 0 65 66if args.flags and args.disks: 67 print("ERROR: can only use -D or -F. Exiting.") 68 exit() 69 70# define BPF program 71bpf_text = """ 72#include <uapi/linux/ptrace.h> 73#include <linux/blk-mq.h> 74 75typedef struct disk_key { 76 dev_t dev; 77 u64 slot; 78} disk_key_t; 79 80typedef struct flag_key { 81 u64 flags; 82 u64 slot; 83} flag_key_t; 84 85typedef struct ext_val { 86 u64 total; 87 u64 count; 88} ext_val_t; 89 90struct tp_args { 91 u64 __unused__; 92 dev_t dev; 93 sector_t sector; 94 unsigned int nr_sector; 95 unsigned int bytes; 96 char rwbs[8]; 97 char comm[16]; 98 char cmd[]; 99}; 100 101struct start_key { 102 dev_t dev; 103 u32 _pad; 104 sector_t sector; 105 CMD_FLAGS 106}; 107 108BPF_HASH(start, struct start_key); 109STORAGE 110 111static dev_t ddevt(struct gendisk *disk) { 112 return (disk->major << 20) | disk->first_minor; 113} 114 115// time block I/O 116static int __trace_req_start(struct start_key key) 117{ 118 DISK_FILTER 119 120 u64 ts = bpf_ktime_get_ns(); 121 start.update(&key, &ts); 122 return 0; 123} 124 125int trace_req_start(struct pt_regs *ctx, struct request *req) 126{ 127 struct start_key key = { 128 .dev = ddevt(req->__RQ_DISK__), 129 .sector = req->__sector 130 }; 131 132 SET_FLAGS 133 134 return __trace_req_start(key); 135} 136 137int trace_req_start_tp(struct tp_args *args) 138{ 139 struct start_key key = { 140 .dev = args->dev, 141 .sector = args->sector 142 }; 143 144 return __trace_req_start(key); 145} 146 147// output 148static int __trace_req_done(struct start_key key) 149{ 150 u64 *tsp, delta; 151 152 // fetch timestamp and calculate delta 153 tsp = start.lookup(&key); 154 if (tsp == 0) { 155 return 0; // missed issue 156 } 157 delta = bpf_ktime_get_ns() - *tsp; 158 159 FACTOR 160 161 // store as histogram 162 STORE 163 164 start.delete(&key); 165 return 0; 166} 167 168int trace_req_done(struct pt_regs *ctx, struct request *req) 169{ 170 struct start_key key = { 171 .dev = ddevt(req->__RQ_DISK__), 172 .sector = req->__sector 173 }; 174 175 SET_FLAGS 176 177 return __trace_req_done(key); 178} 179 180int trace_req_done_tp(struct tp_args *args) 181{ 182 struct start_key key = { 183 .dev = args->dev, 184 .sector = args->sector 185 }; 186 187 return __trace_req_done(key); 188} 189""" 190 191# code substitutions 192if args.milliseconds: 193 bpf_text = bpf_text.replace('FACTOR', 'delta /= 1000000;') 194 label = "msecs" 195else: 196 bpf_text = bpf_text.replace('FACTOR', 'delta /= 1000;') 197 label = "usecs" 198 199storage_str = "" 200store_str = "" 201if args.disks: 202 storage_str += "BPF_HISTOGRAM(dist, disk_key_t);" 203 disks_str = """ 204 disk_key_t dkey = {}; 205 dkey.dev = key.dev; 206 dkey.slot = bpf_log2l(delta); 207 dist.atomic_increment(dkey); 208 """ 209 store_str += disks_str 210elif args.flags: 211 storage_str += "BPF_HISTOGRAM(dist, flag_key_t);" 212 store_str += """ 213 flag_key_t fkey = {.slot = bpf_log2l(delta)}; 214 fkey.flags = key.flags; 215 dist.atomic_increment(fkey); 216 """ 217else: 218 storage_str += "BPF_HISTOGRAM(dist);" 219 store_str += "dist.atomic_increment(bpf_log2l(delta));" 220 221if args.disk is not None: 222 disk_path = os.path.join('/dev', args.disk) 223 if not os.path.exists(disk_path): 224 print("no such disk '%s'" % args.disk) 225 exit(1) 226 227 stat_info = os.stat(disk_path) 228 dev = os.major(stat_info.st_rdev) << 20 | os.minor(stat_info.st_rdev) 229 230 disk_filter_str = """ 231 if(key.dev != %s) { 232 return 0; 233 } 234 """ % (dev) 235 236 bpf_text = bpf_text.replace('DISK_FILTER', disk_filter_str) 237else: 238 bpf_text = bpf_text.replace('DISK_FILTER', '') 239 240if args.extension: 241 storage_str += "BPF_ARRAY(extension, ext_val_t, 1);" 242 store_str += """ 243 u32 index = 0; 244 ext_val_t *ext_val = extension.lookup(&index); 245 if (ext_val) { 246 lock_xadd(&ext_val->total, delta); 247 lock_xadd(&ext_val->count, 1); 248 } 249 """ 250 251bpf_text = bpf_text.replace("STORAGE", storage_str) 252bpf_text = bpf_text.replace("STORE", store_str) 253if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1: 254 bpf_text = bpf_text.replace('__RQ_DISK__', 'rq_disk') 255else: 256 bpf_text = bpf_text.replace('__RQ_DISK__', 'q->disk') 257if args.flags: 258 bpf_text = bpf_text.replace('CMD_FLAGS', 'u64 flags;') 259 bpf_text = bpf_text.replace('SET_FLAGS', 'key.flags = req->cmd_flags;') 260else: 261 bpf_text = bpf_text.replace('CMD_FLAGS', '') 262 bpf_text = bpf_text.replace('SET_FLAGS', '') 263 264if debug or args.ebpf: 265 print(bpf_text) 266 if args.ebpf: 267 exit() 268 269# load BPF program 270b = BPF(text=bpf_text) 271if args.queued: 272 if BPF.get_kprobe_functions(b'__blk_account_io_start'): 273 b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_req_start") 274 elif BPF.get_kprobe_functions(b'blk_account_io_start'): 275 b.attach_kprobe(event="blk_account_io_start", fn_name="trace_req_start") 276 else: 277 if args.flags: 278 # Some flags are accessible in the rwbs field (RAHEAD, SYNC and META) 279 # but other aren't. Disable the -F option for tracepoint for now. 280 print("ERROR: blk_account_io_start probe not available. Can't use -F.") 281 exit() 282 b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_req_start_tp") 283else: 284 if BPF.get_kprobe_functions(b'blk_start_request'): 285 b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start") 286 b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start") 287 288if BPF.get_kprobe_functions(b'__blk_account_io_done'): 289 b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_done") 290elif BPF.get_kprobe_functions(b'blk_account_io_done'): 291 b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_done") 292else: 293 if args.flags: 294 print("ERROR: blk_account_io_done probe not available. Can't use -F.") 295 exit() 296 b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_done_tp") 297 298 299if not args.json: 300 print("Tracing block device I/O... Hit Ctrl-C to end.") 301 302# cache disk major,minor -> diskname 303diskstats = "/proc/diskstats" 304disklookup = {} 305with open(diskstats) as stats: 306 for line in stats: 307 a = line.split() 308 disklookup[a[0] + "," + a[1]] = a[2] 309 310def disk_print(d): 311 major = d >> 20 312 minor = d & ((1 << 20) - 1) 313 314 disk = str(major) + "," + str(minor) 315 if disk in disklookup: 316 diskname = disklookup[disk] 317 else: 318 diskname = "?" 319 320 return diskname 321 322# see blk_fill_rwbs(): 323req_opf = { 324 0: "Read", 325 1: "Write", 326 2: "Flush", 327 3: "Discard", 328 5: "SecureErase", 329 6: "ZoneReset", 330 7: "WriteSame", 331 9: "WriteZeros" 332} 333REQ_OP_BITS = 8 334REQ_OP_MASK = ((1 << REQ_OP_BITS) - 1) 335REQ_SYNC = 1 << (REQ_OP_BITS + 3) 336REQ_META = 1 << (REQ_OP_BITS + 4) 337REQ_PRIO = 1 << (REQ_OP_BITS + 5) 338REQ_NOMERGE = 1 << (REQ_OP_BITS + 6) 339REQ_IDLE = 1 << (REQ_OP_BITS + 7) 340REQ_FUA = 1 << (REQ_OP_BITS + 9) 341REQ_RAHEAD = 1 << (REQ_OP_BITS + 11) 342REQ_BACKGROUND = 1 << (REQ_OP_BITS + 12) 343REQ_NOWAIT = 1 << (REQ_OP_BITS + 13) 344def flags_print(flags): 345 desc = "" 346 # operation 347 if flags & REQ_OP_MASK in req_opf: 348 desc = req_opf[flags & REQ_OP_MASK] 349 else: 350 desc = "Unknown" 351 # flags 352 if flags & REQ_SYNC: 353 desc = "Sync-" + desc 354 if flags & REQ_META: 355 desc = "Metadata-" + desc 356 if flags & REQ_FUA: 357 desc = "ForcedUnitAccess-" + desc 358 if flags & REQ_PRIO: 359 desc = "Priority-" + desc 360 if flags & REQ_NOMERGE: 361 desc = "NoMerge-" + desc 362 if flags & REQ_IDLE: 363 desc = "Idle-" + desc 364 if flags & REQ_RAHEAD: 365 desc = "ReadAhead-" + desc 366 if flags & REQ_BACKGROUND: 367 desc = "Background-" + desc 368 if flags & REQ_NOWAIT: 369 desc = "NoWait-" + desc 370 return desc 371 372# output 373exiting = 0 if args.interval else 1 374dist = b.get_table("dist") 375if args.extension: 376 extension = b.get_table("extension") 377while (1): 378 try: 379 sleep(int(args.interval)) 380 except KeyboardInterrupt: 381 exiting = 1 382 383 print() 384 if args.json: 385 if args.timestamp: 386 print("%-8s\n" % strftime("%H:%M:%S"), end="") 387 388 if args.flags: 389 dist.print_json_hist(label, "flags", flags_print) 390 else: 391 dist.print_json_hist(label, "disk", disk_print) 392 393 else: 394 if args.timestamp: 395 print("%-8s\n" % strftime("%H:%M:%S"), end="") 396 397 if args.flags: 398 dist.print_log2_hist(label, "flags", flags_print) 399 else: 400 dist.print_log2_hist(label, "disk", disk_print) 401 if args.extension: 402 total = extension[0].total 403 count = extension[0].count 404 if count > 0: 405 print("\navg = %ld %s, total: %ld %s, count: %ld\n" % 406 (total / count, label, total, label, count)) 407 extension.clear() 408 409 dist.clear() 410 411 countdown -= 1 412 if exiting or countdown == 0: 413 exit() 414