#!/usr/bin/env python
#
# biolatpcts.py  Monitor IO latency distribution of a block device.
#
# $ ./biolatpcts.py /dev/nvme0n1
# nvme0n1    p1    p5   p10   p16   p25   p50   p75   p84   p90   p95   p99  p100
# read     95us 175us 305us 515us 895us 985us 995us 1.5ms 2.5ms 3.5ms 4.5ms  10ms
# write     5us   5us   5us  15us  25us 135us 765us 855us 885us 895us 965us 1.5ms
# discard   5us   5us   5us   5us 135us 145us 165us 205us 385us 875us 1.5ms 2.5ms
# flush     5us   5us   5us   5us   5us   5us   5us   5us   5us 1.5ms 4.5ms 5.5ms
#
# Copyright (C) 2020 Tejun Heo <[email protected]>
# Copyright (C) 2020 Facebook

from __future__ import print_function
from bcc import BPF
from time import sleep
from threading import Event
import argparse
import json
import sys
import os
import signal

description = """
Monitor IO latency distribution of a block device
"""

epilog = """
When the interval is infinite, biolatpcts prints a result once
initialization is complete to indicate readiness. Once initialized,
biolatpcts reports whenever it receives SIGUSR1/2 and before exiting on
SIGINT, SIGTERM or SIGHUP.

SIGUSR1 starts a new period after reporting. SIGUSR2 doesn't and can be used
to monitor progress without affecting accumulation of data points. Together
they can be used to obtain the latency distribution between two arbitrary
events and to monitor progress in between.
"""

parser = argparse.ArgumentParser(description = description, epilog = epilog,
                                 formatter_class = argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('dev', metavar='DEV', type=str,
                    help='Target block device (/dev/DEVNAME, DEVNAME or MAJ:MIN)')
parser.add_argument('-i', '--interval', type=int, default=3,
                    help='Report interval (0: exit after startup, -1: infinite)')
parser.add_argument('-w', '--which', choices=['from-rq-alloc', 'after-rq-alloc', 'on-device'],
                    default='on-device', help='Which latency to measure')
parser.add_argument('-p', '--pcts', metavar='PCT,...', type=str,
                    default='1,5,10,16,25,50,75,84,90,95,99,100',
                    help='Percentiles to calculate')
parser.add_argument('-j', '--json', action='store_true',
                    help='Output in json')
parser.add_argument('--verbose', '-v', action='count', default = 0)

bpf_source = """
#include <linux/blk_types.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/time64.h>

BPF_PERCPU_ARRAY(rwdf_100ms, u64, 400);
BPF_PERCPU_ARRAY(rwdf_1ms, u64, 400);
BPF_PERCPU_ARRAY(rwdf_10us, u64, 400);

RAW_TRACEPOINT_PROBE(block_rq_complete)
{
        // TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes)
        struct request *rq = (void *)ctx->args[0];
        unsigned int cmd_flags;
        u64 dur;
        size_t base, slot;

        if (!rq->__START_TIME_FIELD__)
                return 0;

        if (!rq->__RQ_DISK__ ||
            rq->__RQ_DISK__->major != __MAJOR__ ||
            rq->__RQ_DISK__->first_minor != __MINOR__)
                return 0;

        cmd_flags = rq->cmd_flags;
        switch (cmd_flags & REQ_OP_MASK) {
        case REQ_OP_READ:
                base = 0;
                break;
        case REQ_OP_WRITE:
                base = 100;
                break;
        case REQ_OP_DISCARD:
                base = 200;
                break;
        case REQ_OP_FLUSH:
                base = 300;
                break;
        default:
                return 0;
        }

        dur = bpf_ktime_get_ns() - rq->__START_TIME_FIELD__;

        slot = min_t(size_t, div_u64(dur, 100 * NSEC_PER_MSEC), 99);
        rwdf_100ms.increment(base + slot);
        if (slot)
                return 0;

        slot = min_t(size_t, div_u64(dur, NSEC_PER_MSEC), 99);
        rwdf_1ms.increment(base + slot);
        if (slot)
                return 0;

        slot = min_t(size_t, div_u64(dur, 10 * NSEC_PER_USEC), 99);
        rwdf_10us.increment(base + slot);
        return 0;
}
"""
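
# A rough sketch of how the per-cpu arrays above are organized and filled
# (the latency values below are only illustrative): each array holds four
# contiguous 100-slot regions, one per operation type (read at offset 0,
# write at 100, discard at 200, flush at 300). Each completion is bucketed
# coarsest-first, e.g. a 2.3ms IO increments rwdf_100ms slot 0 and then
# rwdf_1ms slot 2, while a 150us IO falls all the way through to rwdf_10us
# slot 15.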

args = parser.parse_args()
args.pcts = args.pcts.split(',')
args.pcts.sort(key=lambda x: float(x))

try:
    major = int(args.dev.split(':')[0])
    minor = int(args.dev.split(':')[1])
except Exception:
    if '/' in args.dev:
        stat = os.stat(args.dev)
    else:
        stat = os.stat('/dev/' + args.dev)

    major = os.major(stat.st_rdev)
    minor = os.minor(stat.st_rdev)

if args.which == 'from-rq-alloc':
    start_time_field = 'alloc_time_ns'
elif args.which == 'after-rq-alloc':
    start_time_field = 'start_time_ns'
elif args.which == 'on-device':
    start_time_field = 'io_start_time_ns'
else:
    print("Invalid latency measurement {}".format(args.which))
    sys.exit(1)

bpf_source = bpf_source.replace('__START_TIME_FIELD__', start_time_field)
bpf_source = bpf_source.replace('__MAJOR__', str(major))
bpf_source = bpf_source.replace('__MINOR__', str(minor))

# Newer kernels dropped request->rq_disk; reach the disk through the request
# queue instead.
if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
    bpf_source = bpf_source.replace('__RQ_DISK__', 'rq_disk')
else:
    bpf_source = bpf_source.replace('__RQ_DISK__', 'q->disk')

bpf = BPF(text=bpf_source)

# times are in usecs
MSEC = 1000
SEC = 1000 * 1000

cur_rwdf_100ms = bpf["rwdf_100ms"]
cur_rwdf_1ms = bpf["rwdf_1ms"]
cur_rwdf_10us = bpf["rwdf_10us"]

last_rwdf_100ms = [0] * 400
last_rwdf_1ms = [0] * 400
last_rwdf_10us = [0] * 400

rwdf_100ms = [0] * 400
rwdf_1ms = [0] * 400
rwdf_10us = [0] * 400

io_type = ["read", "write", "discard", "flush"]

def find_pct(req, total, slots, idx, counted):
    while idx > 0:
        idx -= 1
        if slots[idx] > 0:
            counted += slots[idx]
            if args.verbose > 1:
                print('idx={} counted={} pct={:.1f} req={}'
                      .format(idx, counted, counted / total, req))
            if (counted / total) * 100 >= 100 - req:
                break
    return (idx, counted)

# Percentiles are calculated against the coarsest histogram first; when the
# requested percentile falls into its lowest bucket, the search restarts in
# the next finer-grained histogram.
def calc_lat_pct(req_pcts, total, lat_100ms, lat_1ms, lat_10us):
    pcts = [0] * len(req_pcts)

    if total == 0:
        return pcts

    data = [(100 * MSEC, lat_100ms), (MSEC, lat_1ms), (10, lat_10us)]
    data_sel = 0
    idx = 100
    counted = 0

    for pct_idx in reversed(range(len(req_pcts))):
        req = float(req_pcts[pct_idx])
        while True:
            last_counted = counted
            (gran, slots) = data[data_sel]
            (idx, counted) = find_pct(req, total, slots, idx, counted)
            if args.verbose > 1:
                print('pct_idx={} req={} gran={} idx={} counted={} total={}'
                      .format(pct_idx, req, gran, idx, counted, total))
            if idx > 0 or data_sel == len(data) - 1:
                break
            counted = last_counted
            data_sel += 1
            idx = 100

        pcts[pct_idx] = gran * idx + gran / 2

    return pcts

def format_usec(lat):
    if lat > SEC:
        return '{:.1f}s'.format(lat / SEC)
    elif lat > 10 * MSEC:
        return '{:.0f}ms'.format(lat / MSEC)
    elif lat > MSEC:
        return '{:.1f}ms'.format(lat / MSEC)
    elif lat > 0:
        return '{:.0f}us'.format(lat)
    else:
        return '-'

# 0 interval can be used to test whether this script would run successfully.
if args.interval == 0:
    sys.exit(0)

# Set up signal handling so that we print the result on USR1/2 and before
# exiting on a signal. Combined with an infinite interval, this can be used
# to obtain the overall latency distribution between two events. On USR1 the
# accumulated counters are also reset, which can be used to define arbitrary
# intervals.
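#
# A hypothetical session (the device name and the use of $! are purely
# illustrative):
#
#   $ ./biolatpcts.py -i -1 nvme0n1 &
#   ... the first report is printed once tracing is ready ...
#   $ kill -USR1 $!   # report, then start a new accumulation period
#   $ kill -USR2 $!   # report without resetting the accumulated counts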
force_update_last_rwdf = False
keep_running = True
result_req = Event()
def sig_handler(sig, frame):
    global keep_running, force_update_last_rwdf, result_req
    if sig == signal.SIGUSR1:
        force_update_last_rwdf = True
    elif sig != signal.SIGUSR2:
        keep_running = False
    result_req.set()

for sig in (signal.SIGUSR1, signal.SIGUSR2, signal.SIGINT, signal.SIGTERM, signal.SIGHUP):
    signal.signal(sig, sig_handler)

# If infinite interval, always trigger the first output so that the caller
# can tell when initialization is complete.
if args.interval < 0:
    result_req.set()

while keep_running:
    result_req.wait(args.interval if args.interval > 0 else None)
    result_req.clear()

    update_last_rwdf = args.interval > 0 or force_update_last_rwdf
    force_update_last_rwdf = False
    rwdf_total = [0] * 4

    for i in range(400):
        v = cur_rwdf_100ms.sum(i).value
        rwdf_100ms[i] = max(v - last_rwdf_100ms[i], 0)
        if update_last_rwdf:
            last_rwdf_100ms[i] = v

        v = cur_rwdf_1ms.sum(i).value
        rwdf_1ms[i] = max(v - last_rwdf_1ms[i], 0)
        if update_last_rwdf:
            last_rwdf_1ms[i] = v

        v = cur_rwdf_10us.sum(i).value
        rwdf_10us[i] = max(v - last_rwdf_10us[i], 0)
        if update_last_rwdf:
            last_rwdf_10us[i] = v

        rwdf_total[int(i / 100)] += rwdf_100ms[i]

    rwdf_lat = []
    for i in range(4):
        left = i * 100
        right = left + 100
        rwdf_lat.append(
            calc_lat_pct(args.pcts, rwdf_total[i],
                         rwdf_100ms[left:right],
                         rwdf_1ms[left:right],
                         rwdf_10us[left:right]))

        if args.verbose:
            print('{:7} 100ms {}'.format(io_type[i], rwdf_100ms[left:right]))
            print('{:7}   1ms {}'.format(io_type[i], rwdf_1ms[left:right]))
            print('{:7}  10us {}'.format(io_type[i], rwdf_10us[left:right]))

    if args.json:
        result = {}
        for iot in range(4):
            lats = {}
            for pi in range(len(args.pcts)):
                lats[args.pcts[pi]] = rwdf_lat[iot][pi] / SEC
            result[io_type[iot]] = lats
        print(json.dumps(result), flush=True)
    else:
        print('\n{:<7}'.format(os.path.basename(args.dev)), end='')
        widths = []
        for pct in args.pcts:
            widths.append(max(len(pct), 5))
            print(' {:>5}'.format(pct), end='')
        print()
        for iot in range(4):
            print('{:7}'.format(io_type[iot]), end='')
            for pi in range(len(rwdf_lat[iot])):
                print(' {:>{}}'.format(format_usec(rwdf_lat[iot][pi]), widths[pi]), end='')
            print()