xref: /aosp_15_r20/external/bcc/tools/nfsslower.py (revision 387f9dfdfa2baef462e92476d413c7bc2470293e)
1#!/usr/bin/env python
2# @lint-avoid-python-3-compatibility-imports
3#
4# nfsslower     Trace slow NFS operations
5#               for Linux using BCC & eBPF
6#
7# Usage: nfsslower [-h] [-p PID] [min_ms]
8#
9# This script traces some common NFS operations: read, write, opens and
10# getattr. It measures the time spent in these operations, and prints details
11# for each that exceeded a threshold.
12# The script also traces commit operations, which is specific to nfs and could
13# be pretty slow.
14#
15# WARNING: This adds low-overhead instrumentation to these NFS operations,
16# including reads and writes from the file system cache. Such reads and writes
17# can be very frequent (depending on the workload; eg, 1M/sec), at which
18# point the overhead of this tool (even if it prints no "slower" events) can
19# begin to become significant.
20#
21# Most of this code is copied from similar tools (ext4slower, zfsslower etc)
22#
23# By default, a minimum millisecond threshold of 10 is used.
24#
25# This tool uses kprobes to instrument the kernel for entry and exit
26# information, in the future a preferred way would be to use tracepoints.
27# Currently there aren't any tracepoints available for nfs_read_file,
28# nfs_write_file and nfs_open_file, nfs_getattr does have entry and exit
29# tracepoints but we chose to use kprobes for consistency
30# Raw tracepoints are used to trace nfs:nfs_initiate_commit and
31# nfs:nfs_commit_done.
32#
33# 31-Aug-2017   Samuel Nair created this. Should work with NFSv{3,4}
34
35from __future__ import print_function
36from bcc import BPF
37import argparse
38from time import strftime
39
40examples = """
41    ./nfsslower         # trace operations slower than 10ms
42    ./nfsslower 1       # trace operations slower than 1ms
43    ./nfsslower -j 1    # ... 1 ms, parsable output (csv)
44    ./nfsslower 0       # trace all nfs operations
45    ./nfsslower -p 121  # trace pid 121 only
46"""
47parser = argparse.ArgumentParser(
48    description="""Trace READ, WRITE, OPEN, GETATTR \
49and COMMIT NFS calls slower than a threshold,\
50supports NFSv{3,4}""",
51    formatter_class=argparse.RawDescriptionHelpFormatter,
52    epilog=examples)
53
54parser.add_argument("-j", "--csv", action="store_true",
55                    help="just print fields: comma-separated values")
56parser.add_argument("-p", "--pid", help="Trace this pid only")
57parser.add_argument("min_ms", nargs="?", default='10',
58                    help="Minimum IO duration to trace in ms (default=10ms)")
59parser.add_argument("--ebpf", action="store_true",
60                    help=argparse.SUPPRESS)
61args = parser.parse_args()
62min_ms = int(args.min_ms)
63pid = args.pid
64csv = args.csv
65debug = 0
66
67bpf_text = """
68
69#include <uapi/linux/ptrace.h>
70#include <linux/fs.h>
71#include <linux/sched.h>
72#include <linux/dcache.h>
73#include <linux/nfs_fs.h>
74
75#define TRACE_READ 0
76#define TRACE_WRITE 1
77#define TRACE_OPEN 2
78#define TRACE_GETATTR 3
79#define TRACE_COMMIT 4
80
81struct val_t {
82    u64 ts;
83    u64 offset;
84    struct file *fp;
85    struct dentry *d;
86};
87
88struct commit_t {
89    u64 ts;
90    u64 offset;
91    u64 count;
92};
93
94struct data_t {
95    // XXX: switch some to u32's when supported
96    u64 ts_us;
97    u64 type;
98    u64 size;
99    u64 offset;
100    u64 delta_us;
101    u32 pid;
102    char task[TASK_COMM_LEN];
103    char file[DNAME_INLINE_LEN];
104};
105
106BPF_HASH(entryinfo, u64, struct val_t);
107BPF_PERF_OUTPUT(events);
108BPF_HASH(commitinfo, u64, struct commit_t);
109
110int trace_rw_entry(struct pt_regs *ctx, struct kiocb *iocb,
111                                struct iov_iter *data)
112{
113    u64 id = bpf_get_current_pid_tgid();
114    u32 pid = id >> 32; // PID is higher part
115
116    if(FILTER_PID)
117        return 0;
118
119    // store filep and timestamp by id
120    struct val_t val = {};
121    val.ts = bpf_ktime_get_ns();
122    val.fp = iocb->ki_filp;
123    val.d = NULL;
124    val.offset = iocb->ki_pos;
125
126    if (val.fp)
127        entryinfo.update(&id, &val);
128
129    return 0;
130}
131
132int trace_file_open_entry (struct pt_regs *ctx, struct inode *inode,
133                                struct file *filp)
134{
135    u64 id = bpf_get_current_pid_tgid();
136    u32 pid = id >> 32; // PID is higher part
137
138    if(FILTER_PID)
139        return 0;
140
141    // store filep and timestamp by id
142    struct val_t val = {};
143    val.ts = bpf_ktime_get_ns();
144    val.fp = filp;
145    val.d = NULL;
146    val.offset = 0;
147    if (val.fp)
148        entryinfo.update(&id, &val);
149
150    return 0;
151}
152
153int trace_getattr_entry(struct pt_regs *ctx, struct vfsmount *mnt,
154                        struct dentry *dentry, struct kstat *stat)
155{
156    u64 id = bpf_get_current_pid_tgid();
157    u32 pid = id >> 32; // PID is higher part
158
159    if(FILTER_PID)
160        return 0;
161
162    struct val_t val = {};
163    val.ts = bpf_ktime_get_ns();
164    val.fp = NULL;
165    val.d = dentry;
166    val.offset = 0;
167    if (val.d)
168        entryinfo.update(&id, &val);
169
170    return 0;
171}
172
173static int trace_exit(struct pt_regs *ctx, int type)
174{
175    struct val_t *valp;
176    u64 id = bpf_get_current_pid_tgid();
177    u32 pid = id >> 32; // PID is higher part
178
179    valp = entryinfo.lookup(&id);
180    if (valp == 0) {
181        // missed tracing issue or filtered
182        return 0;
183    }
184
185    // calculate delta
186    u64 ts = bpf_ktime_get_ns();
187    u64 delta_us = (ts - valp->ts) / 1000;
188    entryinfo.delete(&id);
189
190    if (FILTER_US)
191        return 0;
192
193    // populate output struct
194    u32 size = PT_REGS_RC(ctx);
195    struct data_t data = {};
196    data.type = type;
197    data.size = size;
198    data.delta_us = delta_us;
199    data.pid = pid;
200    data.ts_us = ts / 1000;
201    data.offset = valp->offset;
202    bpf_get_current_comm(&data.task, sizeof(data.task));
203
204    // workaround (rewriter should handle file to d_name in one step):
205    struct dentry *de = NULL;
206    struct qstr qs = {};
207    if(type == TRACE_GETATTR)
208    {
209        bpf_probe_read_kernel(&de,sizeof(de), &valp->d);
210    }
211    else
212    {
213        bpf_probe_read_kernel(&de, sizeof(de), &valp->fp->f_path.dentry);
214    }
215
216    bpf_probe_read_kernel(&qs, sizeof(qs), (void *)&de->d_name);
217    if (qs.len == 0)
218        return 0;
219
220    bpf_probe_read_kernel(&data.file, sizeof(data.file), (void *)qs.name);
221    // output
222    events.perf_submit(ctx, &data, sizeof(data));
223    return 0;
224}
225
226int trace_file_open_return(struct pt_regs *ctx)
227{
228    return trace_exit(ctx, TRACE_OPEN);
229}
230
231int trace_read_return(struct pt_regs *ctx)
232{
233    return trace_exit(ctx, TRACE_READ);
234}
235
236int trace_write_return(struct pt_regs *ctx)
237{
238    return trace_exit(ctx, TRACE_WRITE);
239}
240
241int trace_getattr_return(struct pt_regs *ctx)
242{
243    return trace_exit(ctx, TRACE_GETATTR);
244}
245
246static int trace_initiate_commit(struct nfs_commit_data *cd)
247{
248    u64 key = (u64)cd;
249    struct commit_t c = { 0 };
250
251    c.ts = bpf_ktime_get_ns();
252    bpf_probe_read_kernel(&c.offset, sizeof(cd->args.offset),
253        &cd->args.offset);
254    bpf_probe_read_kernel(&c.count, sizeof(cd->args.count), &cd->args.count);
255    commitinfo.update(&key, &c);
256    return 0;
257}
258
259"""
260
261bpf_text_raw_tp = """
262RAW_TRACEPOINT_PROBE(nfs_initiate_commit)
263{
264    // TP_PROTO(const struct nfs_commit_data *data)
265    struct nfs_commit_data *cd = (struct nfs_commit_data *)ctx->args[0];
266    return trace_initiate_commit(cd);
267}
268
269RAW_TRACEPOINT_PROBE(nfs_commit_done)
270{
271    // TP_PROTO(const struct rpc_task *task, const struct nfs_commit_data *data)
272    struct nfs_commit_data *cd = (struct nfs_commit_data *)ctx->args[1];
273    u64 key = (u64)cd;
274    struct commit_t *cp = commitinfo.lookup(&key);
275
276    if (cp) {
277        struct nfs_open_context *p;
278        struct dentry *de;
279        struct qstr qs;
280        u64 ts = bpf_ktime_get_ns();
281        u64 delta_us = (ts - cp->ts) / 1000;
282        u32 pid = bpf_get_current_pid_tgid() >> 32;
283
284        struct data_t data = {};
285        data.type = TRACE_COMMIT;
286        data.offset = cp->offset;
287        data.size = cp->count;
288        data.ts_us = ts/1000;
289        data.delta_us = delta_us;
290        data.pid = pid;
291
292        commitinfo.delete(&key);
293        bpf_get_current_comm(&data.task, sizeof(data.task));
294
295        if(FILTER_PID)
296            return 0;
297
298        if (FILTER_US)
299            return 0;
300
301        bpf_probe_read_kernel(&p, sizeof(p), &cd->context);
302        bpf_probe_read_kernel(&de, sizeof(de), &p->dentry);
303        bpf_probe_read_kernel(&qs, sizeof(qs), &de->d_name);
304        if (qs.len) {
305            bpf_probe_read_kernel(&data.file, sizeof(data.file),
306                (void *)qs.name);
307            events.perf_submit(ctx, &data, sizeof(data));
308        }
309    }
310    return 0;
311}
312"""
313
314bpf_text_kprobe = """
315int trace_nfs_initiate_commit(struct pt_regs *ctx, void *clnt, struct nfs_commit_data *cd)
316{
317    return trace_initiate_commit(cd);
318}
319
320int trace_nfs_commit_done(struct pt_regs *ctx, void *task, void *calldata)
321{
322    struct nfs_commit_data *cd = (struct nfs_commit_data *)calldata;
323    u64 key = (u64)cd;
324    struct commit_t *cp = commitinfo.lookup(&key);
325
326    if (cp) {
327        struct nfs_open_context *p;
328        struct dentry *de;
329        struct qstr qs;
330        u64 ts = bpf_ktime_get_ns();
331        u64 delta_us = (ts - cp->ts) / 1000;
332        u32 pid = bpf_get_current_pid_tgid() >> 32;
333
334        struct data_t data = {};
335        data.type = TRACE_COMMIT;
336        data.offset = cp->offset;
337        data.size = cp->count;
338        data.ts_us = ts/1000;
339        data.delta_us = delta_us;
340        data.pid = pid;
341
342        commitinfo.delete(&key);
343        bpf_get_current_comm(&data.task, sizeof(data.task));
344
345        if(FILTER_PID)
346            return 0;
347
348        if (FILTER_US)
349            return 0;
350
351        bpf_probe_read_kernel(&p, sizeof(p), &cd->context);
352        bpf_probe_read_kernel(&de, sizeof(de), &p->dentry);
353        bpf_probe_read_kernel(&qs, sizeof(qs), &de->d_name);
354        if (qs.len) {
355            bpf_probe_read_kernel(&data.file, sizeof(data.file),
356                (void *)qs.name);
357            events.perf_submit(ctx, &data, sizeof(data));
358        }
359    }
360    return 0;
361}
362"""
363is_support_raw_tp = BPF.support_raw_tracepoint()
364if is_support_raw_tp:
365    bpf_text += bpf_text_raw_tp
366else:
367    bpf_text += bpf_text_kprobe
368
369if min_ms == 0:
370    bpf_text = bpf_text.replace('FILTER_US', '0')
371else:
372    bpf_text = bpf_text.replace('FILTER_US',
373                                'delta_us <= %s' % str(min_ms * 1000))
374if args.pid:
375    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid)
376else:
377    bpf_text = bpf_text.replace('FILTER_PID', '0')
378if debug or args.ebpf:
379    print(bpf_text)
380    if args.ebpf:
381        exit()
382
383# process event
384def print_event(cpu, data, size):
385    event = b["events"].event(data)
386
387    type = 'R'
388    if event.type == 1:
389        type = 'W'
390    elif event.type == 2:
391        type = 'O'
392    elif event.type == 3:
393        type = 'G'
394    elif event.type == 4:
395        type = 'C'
396
397    if(csv):
398        print("%d,%s,%d,%s,%d,%d,%d,%s" % (
399            event.ts_us, event.task, event.pid, type, event.size,
400            event.offset, event.delta_us, event.file))
401        return
402    print("%-8s %-14.14s %-6s %1s %-7s %-8d %7.2f %s" %
403          (strftime("%H:%M:%S"),
404           event.task.decode('utf-8', 'replace'),
405           event.pid,
406           type,
407           event.size,
408           event.offset / 1024,
409           float(event.delta_us) / 1000,
410           event.file.decode('utf-8', 'replace')))
411
412
413# Currently specifically works for NFSv4, the other kprobes are generic
414# so it should work with earlier NFS versions
415
416# The following warning is shown on kernels after linux-5.18 when using bcc.
417# Add compile option to silence it.
418#   In file included from /virtual/main.c:7:
419#   In file included from include/linux/nfs_fs.h:31:
420#   In file included from include/linux/sunrpc/auth.h:13:
421#   In file included from include/linux/sunrpc/sched.h:19:
422#   include/linux/sunrpc/xdr.h:751:10: warning: result of comparison of constant 4611686018427387903 with expression of type '__u32' (aka 'unsigned int') is always false [-Wtautological-constant-out-of-range-compare]
423#           if (len > SIZE_MAX / sizeof(*p))
424#               ~~~ ^ ~~~~~~~~~~~~~~~~~~~~~
425#   1 warning generated.
426b = BPF(text=bpf_text,
427    cflags=["-Wno-tautological-constant-out-of-range-compare"])
428b.attach_kprobe(event="nfs_file_read", fn_name="trace_rw_entry")
429b.attach_kprobe(event="nfs_file_write", fn_name="trace_rw_entry")
430b.attach_kprobe(event="nfs_file_open", fn_name="trace_file_open_entry")
431b.attach_kprobe(event="nfs_getattr", fn_name="trace_getattr_entry")
432
433b.attach_kretprobe(event="nfs_file_read", fn_name="trace_read_return")
434b.attach_kretprobe(event="nfs_file_write", fn_name="trace_write_return")
435b.attach_kretprobe(event="nfs_file_open", fn_name="trace_file_open_return")
436b.attach_kretprobe(event="nfs_getattr", fn_name="trace_getattr_return")
437
438if BPF.get_kprobe_functions(b'nfs4_file_open'):
439    b.attach_kprobe(event="nfs4_file_open", fn_name="trace_file_open_entry")
440    b.attach_kretprobe(event="nfs4_file_open", fn_name="trace_file_open_return")
441
442if not is_support_raw_tp:
443    b.attach_kprobe(event="nfs_initiate_commit",
444                    fn_name="trace_nfs_initiate_commit")
445    b.attach_kprobe(event="nfs_commit_done",
446                    fn_name="trace_nfs_commit_done")
447
448if(csv):
449    print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE")
450else:
451    if min_ms == 0:
452        print("Tracing NFS operations... Ctrl-C to quit")
453    else:
454        print("""Tracing NFS operations that are slower than \
455%d ms... Ctrl-C to quit"""
456              % min_ms)
457    print("%-8s %-14s %-6s %1s %-7s %-8s %7s %s" % ("TIME",
458                                                    "COMM",
459                                                    "PID",
460                                                    "T",
461                                                    "BYTES",
462                                                    "OFF_KB",
463                                                    "LAT(ms)",
464                                                    "FILENAME"))
465
466b["events"].open_perf_buffer(print_event, page_cnt=64)
467while 1:
468    try:
469        b.perf_buffer_poll()
470    except KeyboardInterrupt:
471        exit()
472