#!/usr/bin/env python
# @lint-avoid-python-3-compatibility-imports
#
# xfsslower  Trace slow XFS operations.
#            For Linux, uses BCC, eBPF.
#
# USAGE: xfsslower [-h] [-j] [-p PID] [min_ms]
#
# This script traces common XFS file operations: reads, writes, opens, and
# syncs. It measures the time spent in these operations, and prints details
# for each that exceeded a threshold.
#
# WARNING: This adds low-overhead instrumentation to these XFS operations,
# including reads and writes from the file system cache. Such reads and writes
# can be very frequent (depending on the workload; eg, 1M/sec), at which
# point the overhead of this tool (even if it prints no "slower" events) can
# begin to become significant.
#
# By default, a minimum millisecond threshold of 10 is used.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 11-Feb-2016   Brendan Gregg   Created this.
# 16-Oct-2016   Dina Goldshtein -p to filter by process ID.

from __future__ import print_function
from bcc import BPF
import argparse
from time import strftime

# arguments
examples = """examples:
    ./xfsslower             # trace operations slower than 10 ms (default)
    ./xfsslower 1           # trace operations slower than 1 ms
    ./xfsslower -j 1        # ... 1 ms, parsable output (csv)
    ./xfsslower 0           # trace all operations (warning: verbose)
    ./xfsslower -p 185      # trace PID 185 only
"""
parser = argparse.ArgumentParser(
    description="Trace common XFS file operations slower than a threshold",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-j", "--csv", action="store_true",
    help="just print fields: comma-separated values")
parser.add_argument("-p", "--pid",
    help="trace this PID only")
parser.add_argument("min_ms", nargs="?", default='10',
    help="minimum I/O duration to trace, in ms (default 10)")
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
min_ms = int(args.min_ms)
pid = args.pid
csv = args.csv
debug = 0

# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/dcache.h>

// XXX: switch these to char's when supported
#define TRACE_READ      0
#define TRACE_WRITE     1
#define TRACE_OPEN      2
#define TRACE_FSYNC     3

struct val_t {
    u64 ts;
    u64 offset;
    struct file *fp;
};

struct data_t {
    // XXX: switch some to u32's when supported
    u64 ts_us;
    u64 type;
    u64 size;
    u64 offset;
    u64 delta_us;
    u32 pid;
    char task[TASK_COMM_LEN];
    char file[DNAME_INLINE_LEN];
};

BPF_HASH(entryinfo, u64, struct val_t);
BPF_PERF_OUTPUT(events);

//
// Store timestamp and size on entry
//

// xfs_file_read_iter(), xfs_file_write_iter():
int trace_rw_entry(struct pt_regs *ctx, struct kiocb *iocb)
{
    u64 id = bpf_get_current_pid_tgid();
    u32 pid = id >> 32; // PID is higher part

    if (FILTER_PID)
        return 0;

    // store filep and timestamp by id
    struct val_t val = {};
    val.ts = bpf_ktime_get_ns();
    val.fp = iocb->ki_filp;
    val.offset = iocb->ki_pos;
    if (val.fp)
        entryinfo.update(&id, &val);

    return 0;
}

// xfs_file_open():
int trace_open_entry(struct pt_regs *ctx, struct inode *inode,
    struct file *file)
{
    u64 id = bpf_get_current_pid_tgid();
    u32 pid = id >> 32; // PID is higher part

    if (FILTER_PID)
        return 0;

    // store filep and timestamp by id
    struct val_t val = {};
    val.ts = bpf_ktime_get_ns();
    val.fp = file;
    val.offset = 0;
    if (val.fp)
        entryinfo.update(&id, &val);

    return 0;
}

// xfs_file_fsync():
int trace_fsync_entry(struct pt_regs *ctx, struct file *file)
{
    u64 id = bpf_get_current_pid_tgid();
    u32 pid = id >> 32; // PID is higher part

    if (FILTER_PID)
        return 0;

    // store filep and timestamp by id
    struct val_t val = {};
    val.ts = bpf_ktime_get_ns();
    val.fp = file;
    val.offset = 0;
    if (val.fp)
        entryinfo.update(&id, &val);

    return 0;
}

//
// Output
//

static int trace_return(struct pt_regs *ctx, int type)
{
    struct val_t *valp;
    u64 id = bpf_get_current_pid_tgid();
    u32 pid = id >> 32; // PID is higher part

    valp = entryinfo.lookup(&id);
    if (valp == 0) {
        // missed tracing issue or filtered
        return 0;
    }

    // calculate delta
    u64 ts = bpf_ktime_get_ns();
    u64 delta_us = ts - valp->ts;
    entryinfo.delete(&id);

    // Skip entries with backwards time: temp workaround for #728
    if ((s64) delta_us < 0)
        return 0;

    delta_us /= 1000;

    if (FILTER_US)
        return 0;

    // populate output struct
    u32 size = PT_REGS_RC(ctx);
    struct data_t data = {};
    data.type = type;
    data.size = size;
    data.delta_us = delta_us;
    data.pid = pid;
    data.ts_us = ts / 1000;
    data.offset = valp->offset;
    bpf_get_current_comm(&data.task, sizeof(data.task));

    // workaround (rewriter should handle file to d_name in one step):
    struct qstr qs = valp->fp->f_path.dentry->d_name;
    if (qs.len == 0)
        return 0;
    bpf_probe_read_kernel(&data.file, sizeof(data.file), (void *)qs.name);

    // output
    events.perf_submit(ctx, &data, sizeof(data));

    return 0;
}

int trace_read_return(struct pt_regs *ctx)
{
    return trace_return(ctx, TRACE_READ);
}

int trace_write_return(struct pt_regs *ctx)
{
    return trace_return(ctx, TRACE_WRITE);
}

int trace_open_return(struct pt_regs *ctx)
{
    return trace_return(ctx, TRACE_OPEN);
}

int trace_fsync_return(struct pt_regs *ctx)
{
    return trace_return(ctx, TRACE_FSYNC);
}

"""
if min_ms == 0:
    bpf_text = bpf_text.replace('FILTER_US', '0')
else:
    bpf_text = bpf_text.replace('FILTER_US',
        'delta_us <= %s' % str(min_ms * 1000))
if args.pid:
    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid)
else:
    bpf_text = bpf_text.replace('FILTER_PID', '0')
if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()

# process event
def print_event(cpu, data, size):
    event = b["events"].event(data)

    type = 'R'
    if event.type == 1:
        type = 'W'
    elif event.type == 2:
        type = 'O'
    elif event.type == 3:
        type = 'S'

    if (csv):
        print("%d,%s,%d,%s,%d,%d,%d,%s" % (
            event.ts_us, event.task, event.pid, type, event.size,
            event.offset, event.delta_us, event.file))
        return
    print("%-8s %-14.14s %-6s %1s %-7s %-8d %7.2f %s" % (strftime("%H:%M:%S"),
        event.task, event.pid, type, event.size, event.offset / 1024,
        float(event.delta_us) / 1000, event.file))

# initialize BPF
b = BPF(text=bpf_text)

# common file functions
b.attach_kprobe(event="xfs_file_read_iter", fn_name="trace_rw_entry")
b.attach_kprobe(event="xfs_file_write_iter", fn_name="trace_rw_entry")
b.attach_kprobe(event="xfs_file_open", fn_name="trace_open_entry")
b.attach_kprobe(event="xfs_file_fsync", fn_name="trace_fsync_entry")
b.attach_kretprobe(event="xfs_file_read_iter", fn_name="trace_read_return")
b.attach_kretprobe(event="xfs_file_write_iter", fn_name="trace_write_return")
b.attach_kretprobe(event="xfs_file_open", fn_name="trace_open_return")
b.attach_kretprobe(event="xfs_file_fsync", fn_name="trace_fsync_return")

# header
if (csv):
    print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE")
else:
    if min_ms == 0:
        print("Tracing XFS operations")
    else:
        print("Tracing XFS operations slower than %d ms" % min_ms)
    print("%-8s %-14s %-6s %1s %-7s %-8s %7s %s" % ("TIME", "COMM", "PID", "T",
        "BYTES", "OFF_KB", "LAT(ms)", "FILENAME"))

# read events
b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
    try:
        b.perf_buffer_poll()
    except KeyboardInterrupt:
        exit()