#!/usr/bin/env python
# @lint-avoid-python-3-compatibility-imports
#
# nfsslower     Trace slow NFS operations
#               for Linux using BCC & eBPF
#
# Usage: nfsslower [-h] [-p PID] [min_ms]
#
# This script traces some common NFS operations: reads, writes, opens and
# getattrs. It measures the time spent in these operations and prints details
# for each call that exceeds a threshold.
# The script also traces commit operations, which are specific to NFS and can
# be quite slow.
#
# WARNING: This adds low-overhead instrumentation to these NFS operations,
# including reads and writes from the file system cache. Such reads and writes
# can be very frequent (depending on the workload; eg, 1M/sec), at which
# point the overhead of this tool (even if it prints no "slower" events) can
# begin to become significant.
#
# Most of this code is copied from similar tools (ext4slower, zfsslower, etc).
#
# By default, a minimum millisecond threshold of 10 is used.
#
# This tool uses kprobes to instrument the kernel for entry and exit
# information; in the future, tracepoints would be the preferred mechanism.
# Currently there aren't any tracepoints available for nfs_file_read,
# nfs_file_write and nfs_file_open; nfs_getattr does have entry and exit
# tracepoints, but we chose kprobes for consistency.
# Raw tracepoints are used to trace nfs:nfs_initiate_commit and
# nfs:nfs_commit_done.
#
# 31-Aug-2017   Samuel Nair created this. Should work with NFSv{3,4}

from __future__ import print_function
from bcc import BPF
import argparse
from time import strftime

examples = """
    ./nfsslower         # trace operations slower than 10ms
    ./nfsslower 1       # trace operations slower than 1ms
    ./nfsslower -j 1    # ... 1 ms, parsable output (csv)
    ./nfsslower 0       # trace all NFS operations
    ./nfsslower -p 121  # trace pid 121 only
"""
parser = argparse.ArgumentParser(
    description="""Trace READ, WRITE, OPEN, GETATTR \
and COMMIT NFS calls slower than a threshold, \
supports NFSv{3,4}""",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)

parser.add_argument("-j", "--csv", action="store_true",
    help="just print fields: comma-separated values")
parser.add_argument("-p", "--pid", help="trace this PID only")
parser.add_argument("min_ms", nargs="?", default='10',
    help="minimum I/O duration to trace, in ms (default 10)")
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
min_ms = int(args.min_ms)
pid = args.pid
csv = args.csv
debug = 0

bpf_text = """

#include <uapi/linux/ptrace.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/dcache.h>
#include <linux/nfs_fs.h>

#define TRACE_READ 0
#define TRACE_WRITE 1
#define TRACE_OPEN 2
#define TRACE_GETATTR 3
#define TRACE_COMMIT 4

struct val_t {
    u64 ts;
    u64 offset;
    struct file *fp;
    struct dentry *d;
};

struct commit_t {
    u64 ts;
    u64 offset;
    u64 count;
};

struct data_t {
    // XXX: switch some to u32's when supported
    u64 ts_us;
    u64 type;
    u64 size;
    u64 offset;
    u64 delta_us;
    u32 pid;
    char task[TASK_COMM_LEN];
    char file[DNAME_INLINE_LEN];
};

BPF_HASH(entryinfo, u64, struct val_t);
BPF_PERF_OUTPUT(events);
BPF_HASH(commitinfo, u64, struct commit_t);
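
// Note on map keys: entryinfo is keyed on the full pid_tgid value, so
// concurrent threads in the same process each track their own in-flight
// operation; commitinfo is keyed on the address of the nfs_commit_data
// struct, which is what pairs an initiated commit with its completion.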

int trace_rw_entry(struct pt_regs *ctx, struct kiocb *iocb,
                   struct iov_iter *data)
{
    u64 id = bpf_get_current_pid_tgid();
    u32 pid = id >> 32; // PID is the upper 32 bits

    if (FILTER_PID)
        return 0;

    // store filp and timestamp by id
    struct val_t val = {};
    val.ts = bpf_ktime_get_ns();
    val.fp = iocb->ki_filp;
    val.d = NULL;
    val.offset = iocb->ki_pos;

    if (val.fp)
        entryinfo.update(&id, &val);

    return 0;
}

int trace_file_open_entry(struct pt_regs *ctx, struct inode *inode,
                          struct file *filp)
{
    u64 id = bpf_get_current_pid_tgid();
    u32 pid = id >> 32; // PID is the upper 32 bits

    if (FILTER_PID)
        return 0;

    // store filp and timestamp by id
    struct val_t val = {};
    val.ts = bpf_ktime_get_ns();
    val.fp = filp;
    val.d = NULL;
    val.offset = 0;
    if (val.fp)
        entryinfo.update(&id, &val);

    return 0;
}

int trace_getattr_entry(struct pt_regs *ctx, struct vfsmount *mnt,
                        struct dentry *dentry, struct kstat *stat)
{
    u64 id = bpf_get_current_pid_tgid();
    u32 pid = id >> 32; // PID is the upper 32 bits

    if (FILTER_PID)
        return 0;

    struct val_t val = {};
    val.ts = bpf_ktime_get_ns();
    val.fp = NULL;
    val.d = dentry;
    val.offset = 0;
    if (val.d)
        entryinfo.update(&id, &val);

    return 0;
}

static int trace_exit(struct pt_regs *ctx, int type)
{
    struct val_t *valp;
    u64 id = bpf_get_current_pid_tgid();
    u32 pid = id >> 32; // PID is the upper 32 bits

    valp = entryinfo.lookup(&id);
    if (valp == 0) {
        // missed tracing issue or filtered
        return 0;
    }

    // calculate delta
    u64 ts = bpf_ktime_get_ns();
    u64 delta_us = (ts - valp->ts) / 1000;
    entryinfo.delete(&id);

    if (FILTER_US)
        return 0;

    // populate output struct
    u32 size = PT_REGS_RC(ctx);
    struct data_t data = {};
    data.type = type;
    data.size = size;
    data.delta_us = delta_us;
    data.pid = pid;
    data.ts_us = ts / 1000;
    data.offset = valp->offset;
    bpf_get_current_comm(&data.task, sizeof(data.task));

    // workaround (rewriter should handle file to d_name in one step):
    struct dentry *de = NULL;
    struct qstr qs = {};
    if (type == TRACE_GETATTR) {
        bpf_probe_read_kernel(&de, sizeof(de), &valp->d);
    } else {
        bpf_probe_read_kernel(&de, sizeof(de), &valp->fp->f_path.dentry);
    }

    bpf_probe_read_kernel(&qs, sizeof(qs), (void *)&de->d_name);
    if (qs.len == 0)
        return 0;

    bpf_probe_read_kernel(&data.file, sizeof(data.file), (void *)qs.name);
    // output
    events.perf_submit(ctx, &data, sizeof(data));
    return 0;
}

int trace_file_open_return(struct pt_regs *ctx)
{
    return trace_exit(ctx, TRACE_OPEN);
}

int trace_read_return(struct pt_regs *ctx)
{
    return trace_exit(ctx, TRACE_READ);
}

int trace_write_return(struct pt_regs *ctx)
{
    return trace_exit(ctx, TRACE_WRITE);
}

int trace_getattr_return(struct pt_regs *ctx)
{
    return trace_exit(ctx, TRACE_GETATTR);
}

static int trace_initiate_commit(struct nfs_commit_data *cd)
{
    u64 key = (u64)cd;
    struct commit_t c = { 0 };

    c.ts = bpf_ktime_get_ns();
    bpf_probe_read_kernel(&c.offset, sizeof(cd->args.offset),
                          &cd->args.offset);
    bpf_probe_read_kernel(&c.count, sizeof(cd->args.count), &cd->args.count);
    commitinfo.update(&key, &c);
    return 0;
}

"""
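
# Two variants of the commit probes follow: a raw tracepoint version
# (nfs:nfs_initiate_commit / nfs:nfs_commit_done) for kernels that support
# raw tracepoints, and a kprobe fallback for older kernels. Only one of the
# two is appended to bpf_text, based on BPF.support_raw_tracepoint().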

bpf_text_raw_tp = """
RAW_TRACEPOINT_PROBE(nfs_initiate_commit)
{
    // TP_PROTO(const struct nfs_commit_data *data)
    struct nfs_commit_data *cd = (struct nfs_commit_data *)ctx->args[0];
    return trace_initiate_commit(cd);
}

RAW_TRACEPOINT_PROBE(nfs_commit_done)
{
    // TP_PROTO(const struct rpc_task *task, const struct nfs_commit_data *data)
    struct nfs_commit_data *cd = (struct nfs_commit_data *)ctx->args[1];
    u64 key = (u64)cd;
    struct commit_t *cp = commitinfo.lookup(&key);

    if (cp) {
        struct nfs_open_context *p;
        struct dentry *de;
        struct qstr qs;
        u64 ts = bpf_ktime_get_ns();
        u64 delta_us = (ts - cp->ts) / 1000;
        u32 pid = bpf_get_current_pid_tgid() >> 32;

        struct data_t data = {};
        data.type = TRACE_COMMIT;
        data.offset = cp->offset;
        data.size = cp->count;
        data.ts_us = ts / 1000;
        data.delta_us = delta_us;
        data.pid = pid;

        commitinfo.delete(&key);
        bpf_get_current_comm(&data.task, sizeof(data.task));

        if (FILTER_PID)
            return 0;

        if (FILTER_US)
            return 0;

        bpf_probe_read_kernel(&p, sizeof(p), &cd->context);
        bpf_probe_read_kernel(&de, sizeof(de), &p->dentry);
        bpf_probe_read_kernel(&qs, sizeof(qs), &de->d_name);
        if (qs.len) {
            bpf_probe_read_kernel(&data.file, sizeof(data.file),
                                  (void *)qs.name);
            events.perf_submit(ctx, &data, sizeof(data));
        }
    }
    return 0;
}
"""

bpf_text_kprobe = """
int trace_nfs_initiate_commit(struct pt_regs *ctx, void *clnt,
                              struct nfs_commit_data *cd)
{
    return trace_initiate_commit(cd);
}

int trace_nfs_commit_done(struct pt_regs *ctx, void *task, void *calldata)
{
    struct nfs_commit_data *cd = (struct nfs_commit_data *)calldata;
    u64 key = (u64)cd;
    struct commit_t *cp = commitinfo.lookup(&key);

    if (cp) {
        struct nfs_open_context *p;
        struct dentry *de;
        struct qstr qs;
        u64 ts = bpf_ktime_get_ns();
        u64 delta_us = (ts - cp->ts) / 1000;
        u32 pid = bpf_get_current_pid_tgid() >> 32;

        struct data_t data = {};
        data.type = TRACE_COMMIT;
        data.offset = cp->offset;
        data.size = cp->count;
        data.ts_us = ts / 1000;
        data.delta_us = delta_us;
        data.pid = pid;

        commitinfo.delete(&key);
        bpf_get_current_comm(&data.task, sizeof(data.task));

        if (FILTER_PID)
            return 0;

        if (FILTER_US)
            return 0;

        bpf_probe_read_kernel(&p, sizeof(p), &cd->context);
        bpf_probe_read_kernel(&de, sizeof(de), &p->dentry);
        bpf_probe_read_kernel(&qs, sizeof(qs), &de->d_name);
        if (qs.len) {
            bpf_probe_read_kernel(&data.file, sizeof(data.file),
                                  (void *)qs.name);
            events.perf_submit(ctx, &data, sizeof(data));
        }
    }
    return 0;
}
"""

is_support_raw_tp = BPF.support_raw_tracepoint()
if is_support_raw_tp:
    bpf_text += bpf_text_raw_tp
else:
    bpf_text += bpf_text_kprobe

if min_ms == 0:
    bpf_text = bpf_text.replace('FILTER_US', '0')
else:
    bpf_text = bpf_text.replace('FILTER_US',
        'delta_us <= %s' % str(min_ms * 1000))
if args.pid:
    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid)
else:
    bpf_text = bpf_text.replace('FILTER_PID', '0')
if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()
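
# For example, running ./nfsslower -p 121 with the default 10 ms threshold
# rewrites the placeholders above to pid != 121 and delta_us <= 10000, so
# events are filtered in-kernel before being pushed through the perf buffer.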

# process event
def print_event(cpu, data, size):
    event = b["events"].event(data)

    type = 'R'
    if event.type == 1:
        type = 'W'
    elif event.type == 2:
        type = 'O'
    elif event.type == 3:
        type = 'G'
    elif event.type == 4:
        type = 'C'

    if (csv):
        print("%d,%s,%d,%s,%d,%d,%d,%s" % (
            event.ts_us, event.task, event.pid, type, event.size,
            event.offset, event.delta_us, event.file))
        return
    print("%-8s %-14.14s %-6s %1s %-7s %-8d %7.2f %s" %
          (strftime("%H:%M:%S"),
           event.task.decode('utf-8', 'replace'),
           event.pid,
           type,
           event.size,
           event.offset / 1024,
           float(event.delta_us) / 1000,
           event.file.decode('utf-8', 'replace')))


# Currently this specifically targets NFSv4; the other kprobes are generic,
# so it should also work with earlier NFS versions.

# The following warning is shown on kernels after linux-5.18 when using bcc.
# Add a compile option to silence it.
# In file included from /virtual/main.c:7:
# In file included from include/linux/nfs_fs.h:31:
# In file included from include/linux/sunrpc/auth.h:13:
# In file included from include/linux/sunrpc/sched.h:19:
# include/linux/sunrpc/xdr.h:751:10: warning: result of comparison of constant
# 4611686018427387903 with expression of type '__u32' (aka 'unsigned int') is
# always false [-Wtautological-constant-out-of-range-compare]
#                 if (len > SIZE_MAX / sizeof(*p))
#                     ~~~ ^ ~~~~~~~~~~~~~~~~~~~~~
# 1 warning generated.
b = BPF(text=bpf_text,
        cflags=["-Wno-tautological-constant-out-of-range-compare"])
b.attach_kprobe(event="nfs_file_read", fn_name="trace_rw_entry")
b.attach_kprobe(event="nfs_file_write", fn_name="trace_rw_entry")
b.attach_kprobe(event="nfs_file_open", fn_name="trace_file_open_entry")
b.attach_kprobe(event="nfs_getattr", fn_name="trace_getattr_entry")

b.attach_kretprobe(event="nfs_file_read", fn_name="trace_read_return")
b.attach_kretprobe(event="nfs_file_write", fn_name="trace_write_return")
b.attach_kretprobe(event="nfs_file_open", fn_name="trace_file_open_return")
b.attach_kretprobe(event="nfs_getattr", fn_name="trace_getattr_return")

if BPF.get_kprobe_functions(b'nfs4_file_open'):
    b.attach_kprobe(event="nfs4_file_open", fn_name="trace_file_open_entry")
    b.attach_kretprobe(event="nfs4_file_open",
                       fn_name="trace_file_open_return")

if not is_support_raw_tp:
    b.attach_kprobe(event="nfs_initiate_commit",
                    fn_name="trace_nfs_initiate_commit")
    b.attach_kprobe(event="nfs_commit_done",
                    fn_name="trace_nfs_commit_done")

if (csv):
    print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE")
else:
    if min_ms == 0:
        print("Tracing NFS operations... Ctrl-C to quit")
    else:
        print("""Tracing NFS operations that are slower than \
%d ms... Ctrl-C to quit""" % min_ms)
    print("%-8s %-14s %-6s %1s %-7s %-8s %7s %s" % ("TIME",
                                                    "COMM",
                                                    "PID",
                                                    "T",
                                                    "BYTES",
                                                    "OFF_KB",
                                                    "LAT(ms)",
                                                    "FILENAME"))

b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
    try:
        b.perf_buffer_poll()
    except KeyboardInterrupt:
        exit()