#!/usr/bin/env bcc-lua
--[[
Copyright 2016 GitHub, Inc

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--]]

-- Traces block device I/O and prints one line per completed request:
-- elapsed time, issuing command and PID, disk, read/write flag, sector,
-- size in bytes and latency.
--
-- The eBPF program below keeps two hashes keyed by `struct request *`:
--   * `infobyreq` - PID and comm captured in trace_pid_start
--   * `start`     - timestamp (ns) captured in trace_req_start
-- trace_req_completion joins both, emits a `data_t` record through the
-- `events` perf buffer and then drops the two map entries.

local program = [[
#include <uapi/linux/ptrace.h>
#include <linux/blkdev.h>

struct val_t {
  u32 pid;
  char name[TASK_COMM_LEN];
};

struct data_t {
  u32 pid;
  u64 rwflag;
  u64 delta;
  u64 sector;
  u64 len;
  u64 ts;
  char disk_name[DISK_NAME_LEN];
  char name[TASK_COMM_LEN];
};

BPF_HASH(start, struct request *);
BPF_HASH(infobyreq, struct request *, struct val_t);
BPF_PERF_OUTPUT(events);

// cache PID and comm by-req
int trace_pid_start(struct pt_regs *ctx, struct request *req)
{
  struct val_t val = {};

  if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) {
    // The process id (tgid) lives in the upper 32 bits; the lower 32 bits
    // hold the kernel thread id, which is not what the PID column reports.
    val.pid = bpf_get_current_pid_tgid() >> 32;
    infobyreq.update(&req, &val);
  }
  return 0;
}

// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
  u64 ts;

  ts = bpf_ktime_get_ns();
  start.update(&req, &ts);

  return 0;
}

// output
int trace_req_completion(struct pt_regs *ctx, struct request *req)
{
  u64 *tsp;
  struct val_t *valp;
  struct data_t data = {};
  u64 ts;

  // fetch timestamp and calculate delta
  tsp = start.lookup(&req);
  if (tsp == 0) {
    // missed tracing issue
    return 0;
  }
  ts = bpf_ktime_get_ns();
  data.delta = ts - *tsp;   // latency in nanoseconds
  data.ts = ts / 1000;      // completion time in microseconds

  valp = infobyreq.lookup(&req);
  if (valp == 0) {
    // issuer unknown: report "?" as the command name
    data.len = req->__data_len;
    data.name[0] = '?';
    data.name[1] = 0;
  } else {
    data.pid = valp->pid;
    data.len = req->__data_len;
    data.sector = req->__sector;
    bpf_probe_read_kernel(&data.name, sizeof(data.name), valp->name);
    bpf_probe_read_kernel(&data.disk_name, sizeof(data.disk_name),
                          req->rq_disk->disk_name);
  }

/*
 * The following deals with a kernel version change (in mainline 4.7, although
 * it may be backported to earlier kernels) with how block request write flags
 * are tested. We handle both pre- and post-change versions here. Please avoid
 * kernel version tests like this as much as possible: they inflate the code,
 * test, and maintenance burden.
 */
#ifdef REQ_WRITE
  data.rwflag = !!(req->cmd_flags & REQ_WRITE);
#elif defined(REQ_OP_SHIFT)
  data.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
#else
  data.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
#endif

  events.perf_submit(ctx, &data, sizeof(data));
  start.delete(&req);
  infobyreq.delete(&req);

  return 0;
}
]]

local ffi = require("ffi")

--- Tool entry point invoked by the bcc-lua runner.
-- Compiles the eBPF program, attaches the kprobes, prints the column header
-- and then polls the perf buffer, printing one line per completed request.
-- @param BPF   bcc BPF class used to compile the program and attach probes
-- @param utils helper table supplied by the runner (unused here)
return function(BPF, utils)
  local TASK_COMM_LEN = 16 -- linux/sched.h
  local DISK_NAME_LEN = 32 -- linux/genhd.h

  local bpf = BPF:new{text=program}

  bpf:attach_kprobe{event="blk_account_io_start", fn_name="trace_pid_start"}

  -- The legacy entry point blk_start_request is absent on kernels that
  -- dropped the single-queue request path (Linux 5.0 and later), so attach
  -- the two "request started" probes on a best-effort basis and only fail
  -- when neither of them could be attached.
  local start_probes = 0
  for _, event in ipairs({"blk_start_request", "blk_mq_start_request"}) do
    local ok = pcall(function()
      bpf:attach_kprobe{event=event, fn_name="trace_req_start"}
    end)
    if ok then
      start_probes = start_probes + 1
    end
  end
  assert(start_probes > 0,
    "failed to attach to blk_start_request or blk_mq_start_request")

  bpf:attach_kprobe{event="blk_account_io_done",
    fn_name="trace_req_completion"}

  print("%-14s %-14s %-6s %-7s %-2s %-9s %-7s %7s" % {"TIME(s)", "COMM", "PID",
    "DISK", "T", "SECTOR", "BYTES", "LAT(ms)"})

  -- Timestamp (microseconds) of the first event received; nil until then.
  -- The TIME(s) column is reported relative to it.
  local first_ts = nil

  --- Perf-buffer callback: format and print a single completed request.
  -- @param cpu   CPU the event was delivered from (unused)
  -- @param event cdata record laid out like `struct data_t` above
  local function print_event(cpu, event)
    local event_ts = tonumber(event.ts)
    local event_name = ffi.string(event.name)

    -- rwflag is produced by `!!(...)` in the BPF program, so it is 0 or 1.
    local rwflg = (tonumber(event.rwflag) == 1) and "W" or "R"

    -- The BPF side only fills in the sector when the issuer is known; an
    -- unknown issuer is reported with "?" as its name and sector -1.
    local sector = -1
    if not event_name:find("?", 1, true) then
      sector = tonumber(event.sector)
    end

    if first_ts == nil then
      first_ts = event_ts
    end
    local elapsed_us = event_ts - first_ts

    -- elapsed: microseconds -> seconds; latency: nanoseconds -> milliseconds
    print("%-14.9f %-14.14s %-6s %-7s %-2s %-9s %-7s %7.2f" % {
      elapsed_us / 1000000, event_name, event.pid,
      ffi.string(event.disk_name), rwflg, sector,
      tonumber(event.len), tonumber(event.delta) / 1000000})
  end

  -- The declaration must mirror `struct data_t`; each `$` is replaced by the
  -- corresponding array length from the table that follows.
  bpf:get_table("events"):open_perf_buffer(print_event, [[
    struct {
      uint32_t pid;
      uint64_t rwflag;
      uint64_t delta;
      uint64_t sector;
      uint64_t len;
      uint64_t ts;
      char disk_name[$];
      char name[$];
    }
  ]], {DISK_NAME_LEN, TASK_COMM_LEN}, 64)
  bpf:perf_buffer_poll_loop()
end