xref: /aosp_15_r20/external/bcc/tools/biolatpcts.py (revision 387f9dfdfa2baef462e92476d413c7bc2470293e)
1#!/usr/bin/env python
2#
3# biolatpcts.py  Monitor IO latency distribution of a block device.
4#
5#  $ ./biolatpcts.py /dev/nvme0n1
6#  nvme0n1    p1    p5   p10   p16   p25   p50   p75   p84   p90   p95   p99  p100
7#  read     95us 175us 305us 515us 895us 985us 995us 1.5ms 2.5ms 3.5ms 4.5ms  10ms
8#  write     5us   5us   5us  15us  25us 135us 765us 855us 885us 895us 965us 1.5ms
9#  discard   5us   5us   5us   5us 135us 145us 165us 205us 385us 875us 1.5ms 2.5ms
10#  flush     5us   5us   5us   5us   5us   5us   5us   5us   5us 1.5ms 4.5ms 5.5ms
11#
12# Copyright (C) 2020 Tejun Heo <[email protected]>
13# Copyright (C) 2020 Facebook
14
15from __future__ import print_function
16from bcc import BPF
17from time import sleep
18from threading import Event
19import argparse
20import json
21import sys
22import os
23import signal
24
25description = """
26Monitor IO latency distribution of a block device
27"""
28
29epilog = """
30When interval is infinite, biolatpcts will print out result once the
31initialization is complete to indicate readiness. After initialized,
32biolatpcts will output whenever it receives SIGUSR1/2 and before exiting on
33SIGINT, SIGTERM or SIGHUP.
34
35SIGUSR1 starts a new period after reporting. SIGUSR2 doesn't and can be used
36to monitor progress without affecting accumulation of data points. They can
37be used to obtain latency distribution between two arbitrary events and
38monitor progress inbetween.
39"""
40
41parser = argparse.ArgumentParser(description = description, epilog = epilog,
42                                 formatter_class = argparse.ArgumentDefaultsHelpFormatter)
43parser.add_argument('dev', metavar='DEV', type=str,
44                    help='Target block device (/dev/DEVNAME, DEVNAME or MAJ:MIN)')
45parser.add_argument('-i', '--interval', type=int, default=3,
46                    help='Report interval (0: exit after startup, -1: infinite)')
47parser.add_argument('-w', '--which', choices=['from-rq-alloc', 'after-rq-alloc', 'on-device'],
48                    default='on-device', help='Which latency to measure')
49parser.add_argument('-p', '--pcts', metavar='PCT,...', type=str,
50                    default='1,5,10,16,25,50,75,84,90,95,99,100',
51                    help='Percentiles to calculate')
52parser.add_argument('-j', '--json', action='store_true',
53                    help='Output in json')
54parser.add_argument('--verbose', '-v', action='count', default = 0)
55
56bpf_source = """
57#include <linux/blk_types.h>
58#include <linux/blkdev.h>
59#include <linux/blk-mq.h>
60#include <linux/time64.h>
61
62BPF_PERCPU_ARRAY(rwdf_100ms, u64, 400);
63BPF_PERCPU_ARRAY(rwdf_1ms, u64, 400);
64BPF_PERCPU_ARRAY(rwdf_10us, u64, 400);
65
66RAW_TRACEPOINT_PROBE(block_rq_complete)
67{
68        // TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes)
69        struct request *rq = (void *)ctx->args[0];
70        unsigned int cmd_flags;
71        u64 dur;
72        size_t base, slot;
73
74        if (!rq->__START_TIME_FIELD__)
75                return 0;
76
77        if (!rq->__RQ_DISK__ ||
78            rq->__RQ_DISK__->major != __MAJOR__ ||
79            rq->__RQ_DISK__->first_minor != __MINOR__)
80                return 0;
81
82        cmd_flags = rq->cmd_flags;
83        switch (cmd_flags & REQ_OP_MASK) {
84        case REQ_OP_READ:
85                base = 0;
86                break;
87        case REQ_OP_WRITE:
88                base = 100;
89                break;
90        case REQ_OP_DISCARD:
91                base = 200;
92                break;
93        case REQ_OP_FLUSH:
94                base = 300;
95                break;
96        default:
97                return 0;
98        }
99
100        dur = bpf_ktime_get_ns() - rq->__START_TIME_FIELD__;
101
102        slot = min_t(size_t, div_u64(dur, 100 * NSEC_PER_MSEC), 99);
103        rwdf_100ms.increment(base + slot);
104        if (slot)
105                return 0;
106
107        slot = min_t(size_t, div_u64(dur, NSEC_PER_MSEC), 99);
108        rwdf_1ms.increment(base + slot);
109        if (slot)
110                return 0;
111
112        slot = min_t(size_t, div_u64(dur, 10 * NSEC_PER_USEC), 99);
113        rwdf_10us.increment(base + slot);
114        return 0;
115}
116"""
117
118args = parser.parse_args()
119args.pcts = args.pcts.split(',')
120args.pcts.sort(key=lambda x: float(x))
121
122try:
123    major = int(args.dev.split(':')[0])
124    minor = int(args.dev.split(':')[1])
125except Exception:
126    if '/' in args.dev:
127        stat = os.stat(args.dev)
128    else:
129        stat = os.stat('/dev/' + args.dev)
130
131    major = os.major(stat.st_rdev)
132    minor = os.minor(stat.st_rdev)
133
134if args.which == 'from-rq-alloc':
135    start_time_field = 'alloc_time_ns'
136elif args.which == 'after-rq-alloc':
137    start_time_field = 'start_time_ns'
138elif args.which == 'on-device':
139    start_time_field = 'io_start_time_ns'
140else:
141    print("Invalid latency measurement {}".format(args.which))
142    exit()
143
144bpf_source = bpf_source.replace('__START_TIME_FIELD__', start_time_field)
145bpf_source = bpf_source.replace('__MAJOR__', str(major))
146bpf_source = bpf_source.replace('__MINOR__', str(minor))
147
148if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
149    bpf_source = bpf_source.replace('__RQ_DISK__', 'rq_disk')
150else:
151    bpf_source = bpf_source.replace('__RQ_DISK__', 'q->disk')
152
153bpf = BPF(text=bpf_source)
154
155# times are in usecs
156MSEC = 1000
157SEC = 1000 * 1000
158
159cur_rwdf_100ms = bpf["rwdf_100ms"]
160cur_rwdf_1ms = bpf["rwdf_1ms"]
161cur_rwdf_10us = bpf["rwdf_10us"]
162
163last_rwdf_100ms = [0] * 400
164last_rwdf_1ms = [0] * 400
165last_rwdf_10us = [0] * 400
166
167rwdf_100ms = [0] * 400
168rwdf_1ms = [0] * 400
169rwdf_10us = [0] * 400
170
171io_type = ["read", "write", "discard", "flush"]
172
173def find_pct(req, total, slots, idx, counted):
174    while idx > 0:
175        idx -= 1
176        if slots[idx] > 0:
177            counted += slots[idx]
178            if args.verbose > 1:
179                print('idx={} counted={} pct={:.1f} req={}'
180                      .format(idx, counted, counted / total, req))
181            if (counted / total) * 100 >= 100 - req:
182                break
183    return (idx, counted)
184
185def calc_lat_pct(req_pcts, total, lat_100ms, lat_1ms, lat_10us):
186    pcts = [0] * len(req_pcts)
187
188    if total == 0:
189        return pcts
190
191    data = [(100 * MSEC, lat_100ms), (MSEC, lat_1ms), (10, lat_10us)]
192    data_sel = 0
193    idx = 100
194    counted = 0
195
196    for pct_idx in reversed(range(len(req_pcts))):
197        req = float(req_pcts[pct_idx])
198        while True:
199            last_counted = counted
200            (gran, slots) = data[data_sel]
201            (idx, counted) = find_pct(req, total, slots, idx, counted)
202            if args.verbose > 1:
203                print('pct_idx={} req={} gran={} idx={} counted={} total={}'
204                      .format(pct_idx, req, gran, idx, counted, total))
205            if idx > 0 or data_sel == len(data) - 1:
206                break
207            counted = last_counted
208            data_sel += 1
209            idx = 100
210
211        pcts[pct_idx] = gran * idx + gran / 2
212
213    return pcts
214
215def format_usec(lat):
216    if lat > SEC:
217        return '{:.1f}s'.format(lat / SEC)
218    elif lat > 10 * MSEC:
219        return '{:.0f}ms'.format(lat / MSEC)
220    elif lat > MSEC:
221        return '{:.1f}ms'.format(lat / MSEC)
222    elif lat > 0:
223        return '{:.0f}us'.format(lat)
224    else:
225        return '-'
226
227# 0 interval can be used to test whether this script would run successfully.
228if args.interval == 0:
229    sys.exit(0)
230
231# Set up signal handling so that we print the result on USR1/2 and before
232# exiting on a signal. Combined with infinite interval, this can be used to
233# obtain overall latency distribution between two events. On USR2 the
234# accumulated counters are cleared too, which can be used to define
235# arbitrary intervals.
236force_update_last_rwdf = False
237keep_running = True
238result_req = Event()
239def sig_handler(sig, frame):
240    global keep_running, force_update_last_rwdf, result_req
241    if sig == signal.SIGUSR1:
242        force_update_last_rwdf = True
243    elif sig != signal.SIGUSR2:
244        keep_running = False
245    result_req.set()
246
247for sig in (signal.SIGUSR1, signal.SIGUSR2, signal.SIGINT, signal.SIGTERM, signal.SIGHUP):
248    signal.signal(sig, sig_handler)
249
250# If infinite interval, always trigger the first output so that the caller
251# can tell when initialization is complete.
252if args.interval < 0:
253    result_req.set();
254
255while keep_running:
256    result_req.wait(args.interval if args.interval > 0 else None)
257    result_req.clear()
258
259    update_last_rwdf = args.interval > 0 or force_update_last_rwdf
260    force_update_last_rwdf = False
261    rwdf_total = [0] * 4;
262
263    for i in range(400):
264        v = cur_rwdf_100ms.sum(i).value
265        rwdf_100ms[i] = max(v - last_rwdf_100ms[i], 0)
266        if update_last_rwdf:
267            last_rwdf_100ms[i] = v
268
269        v = cur_rwdf_1ms.sum(i).value
270        rwdf_1ms[i] = max(v - last_rwdf_1ms[i], 0)
271        if update_last_rwdf:
272            last_rwdf_1ms[i] = v
273
274        v = cur_rwdf_10us.sum(i).value
275        rwdf_10us[i] = max(v - last_rwdf_10us[i], 0)
276        if update_last_rwdf:
277            last_rwdf_10us[i] = v
278
279        rwdf_total[int(i / 100)] += rwdf_100ms[i]
280
281    rwdf_lat = []
282    for i in range(4):
283        left = i * 100
284        right = left + 100
285        rwdf_lat.append(
286            calc_lat_pct(args.pcts, rwdf_total[i],
287                         rwdf_100ms[left:right],
288                         rwdf_1ms[left:right],
289                         rwdf_10us[left:right]))
290
291        if args.verbose:
292            print('{:7} 100ms {}'.format(io_type[i], rwdf_100ms[left:right]))
293            print('{:7}   1ms {}'.format(io_type[i], rwdf_1ms[left:right]))
294            print('{:7}  10us {}'.format(io_type[i], rwdf_10us[left:right]))
295
296    if args.json:
297        result = {}
298        for iot in range(4):
299            lats = {}
300            for pi in range(len(args.pcts)):
301                lats[args.pcts[pi]] = rwdf_lat[iot][pi] / SEC
302            result[io_type[iot]] = lats
303        print(json.dumps(result), flush=True)
304    else:
305        print('\n{:<7}'.format(os.path.basename(args.dev)), end='')
306        widths = []
307        for pct in args.pcts:
308            widths.append(max(len(pct), 5))
309            print(' {:>5}'.format(pct), end='')
310        print()
311        for iot in range(4):
312            print('{:7}'.format(io_type[iot]), end='')
313            for pi in range(len(rwdf_lat[iot])):
314                print(' {:>{}}'.format(format_usec(rwdf_lat[iot][pi]), widths[pi]), end='')
315            print()
316